Skip to content

Commit

Permalink
Update ner-with-haystack.ipynb
Browse files Browse the repository at this point in the history
  • Loading branch information
lfunderburk committed Dec 4, 2024
1 parent 984a5c5 commit cf3f70a
Showing 1 changed file with 191 additions and 2 deletions.
193 changes: 191 additions & 2 deletions ch8/ner-with-haystack.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,12 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Function to extract entity annotations into a DataFrame\n",
"# Function to extract uniquely identified named entities into a DataFrame with URL\n",
"def extract_named_entities_with_ids_and_url(documents):\n",
" extracted_data = []\n",
Expand Down Expand Up @@ -139,6 +138,196 @@
"df_entities.drop_duplicates(subset=['word', 'entity_type','score'], inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>document_id</th>\n",
" <th>word</th>\n",
" <th>entity_type</th>\n",
" <th>score</th>\n",
" <th>url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200...</td>\n",
" <td>Chinese New Year</td>\n",
" <td>MISC</td>\n",
" <td>0.870644</td>\n",
" <td>https://www.britannica.com/topic/Chinese-New-Year</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200...</td>\n",
" <td>Lunar New Year</td>\n",
" <td>MISC</td>\n",
" <td>0.915542</td>\n",
" <td>https://www.britannica.com/topic/Chinese-New-Year</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200...</td>\n",
" <td>China</td>\n",
" <td>LOC</td>\n",
" <td>0.918623</td>\n",
" <td>https://www.britannica.com/topic/Chinese-New-Year</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200...</td>\n",
" <td>Chinese</td>\n",
" <td>MISC</td>\n",
" <td>0.664121</td>\n",
" <td>https://www.britannica.com/topic/Chinese-New-Year</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200...</td>\n",
" <td>Lunar New Year</td>\n",
" <td>MISC</td>\n",
" <td>0.868886</td>\n",
" <td>https://www.britannica.com/topic/Chinese-New-Year</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" document_id word \\\n",
"0 eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200... Chinese New Year \n",
"1 eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200... Lunar New Year \n",
"2 eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200... China \n",
"3 eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200... Chinese \n",
"4 eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200... Lunar New Year \n",
"\n",
" entity_type score url \n",
"0 MISC 0.870644 https://www.britannica.com/topic/Chinese-New-Year \n",
"1 MISC 0.915542 https://www.britannica.com/topic/Chinese-New-Year \n",
"2 LOC 0.918623 https://www.britannica.com/topic/Chinese-New-Year \n",
"3 MISC 0.664121 https://www.britannica.com/topic/Chinese-New-Year \n",
"4 MISC 0.868886 https://www.britannica.com/topic/Chinese-New-Year "
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_entities.head()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>98.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.898904</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.140579</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.374084</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.870194</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.961658</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>0.999651</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>0.999793</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" score\n",
"count 98.000000\n",
"mean 0.898904\n",
"std 0.140579\n",
"min 0.374084\n",
"25% 0.870194\n",
"50% 0.961658\n",
"75% 0.999651\n",
"max 0.999793"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_entities.describe()"
]
},
{
"cell_type": "code",
"execution_count": 44,
Expand Down

0 comments on commit cf3f70a

Please sign in to comment.