Skip to content

Commit

Permalink
complete ner with Haystack mini project
Browse files Browse the repository at this point in the history
  • Loading branch information
lfunderburk committed Dec 4, 2024
1 parent 5a80bbf commit 984a5c5
Showing 1 changed file with 104 additions and 41 deletions.
145 changes: 104 additions & 41 deletions ch8/ner-with-haystack.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -105,65 +105,128 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'content_type': 'text/html',\n",
" 'url': 'https://www.britannica.com/topic/Chinese-New-Year',\n",
" 'named_entities': [NamedEntityAnnotation(entity='MISC', start=0, end=16, score=np.float32(0.8706437)),\n",
" NamedEntityAnnotation(entity='MISC', start=30, end=44, score=np.float32(0.91554195)),\n",
" NamedEntityAnnotation(entity='LOC', start=61, end=66, score=np.float32(0.9186233)),\n",
" NamedEntityAnnotation(entity='MISC', start=67, end=74, score=np.float32(0.66412055)),\n",
" NamedEntityAnnotation(entity='MISC', start=82, end=96, score=np.float32(0.8688859)),\n",
" NamedEntityAnnotation(entity='MISC', start=121, end=137, score=np.float32(0.9623936)),\n",
" NamedEntityAnnotation(entity='LOC', start=165, end=170, score=np.float32(0.99973375)),\n",
" NamedEntityAnnotation(entity='MISC', start=175, end=182, score=np.float32(0.99974746)),\n",
" NamedEntityAnnotation(entity='MISC', start=311, end=318, score=np.float32(0.9997546)),\n",
" NamedEntityAnnotation(entity='MISC', start=414, end=428, score=np.float32(0.9671516)),\n",
" NamedEntityAnnotation(entity='LOC', start=523, end=528, score=np.float32(0.9997794)),\n",
" NamedEntityAnnotation(entity='MISC', start=588, end=604, score=np.float32(0.9242876)),\n",
" NamedEntityAnnotation(entity='MISC', start=650, end=665, score=np.float32(0.7919462)),\n",
" NamedEntityAnnotation(entity='MISC', start=713, end=729, score=np.float32(0.9364104)),\n",
" NamedEntityAnnotation(entity='MISC', start=761, end=777, score=np.float32(0.9695166)),\n",
" NamedEntityAnnotation(entity='PER', start=859, end=861, score=np.float32(0.73694575)),\n",
" NamedEntityAnnotation(entity='MISC', start=865, end=871, score=np.float32(0.7323043)),\n",
" NamedEntityAnnotation(entity='MISC', start=1304, end=1320, score=np.float32(0.9760999)),\n",
" NamedEntityAnnotation(entity='MISC', start=1443, end=1459, score=np.float32(0.93961865)),\n",
" NamedEntityAnnotation(entity='MISC', start=1673, end=1689, score=np.float32(0.93505925)),\n",
" NamedEntityAnnotation(entity='MISC', start=1704, end=1720, score=np.float32(0.92978096)),\n",
" NamedEntityAnnotation(entity='MISC', start=1838, end=1845, score=np.float32(0.9997471))]}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"extracted_documents[0].meta"
"import pandas as pd\n",
"\n",
"# Flatten the named-entity annotations of every document into a DataFrame\n",
"# (one row per annotation, keeping the source document id and URL)\n",
"def extract_named_entities_with_ids_and_url(documents):\n",
"    \"\"\"Flatten the named-entity annotations of each document into one DataFrame.\n",
"\n",
"    Args:\n",
"        documents: Iterable of documents exposing `id`, `content` and a `meta`\n",
"            dict. Annotations are read from `meta['named_entities']`; each one\n",
"            must expose `start`, `end`, `entity` and `score`.\n",
"\n",
"    Returns:\n",
"        pandas.DataFrame with one row per annotation and the columns\n",
"        `document_id`, `word`, `entity_type`, `score` and `url`. The columns\n",
"        are present even when no annotation is found.\n",
"    \"\"\"\n",
"    columns = ['document_id', 'word', 'entity_type', 'score', 'url']\n",
"    extracted_data = []\n",
"    for document in documents:\n",
"        # A document without text cannot be sliced; fall back to an empty string\n",
"        content = document.content or ''\n",
"        url = document.meta.get('url', 'N/A')  # Default to 'N/A' if URL is not available\n",
"        for entity in document.meta.get('named_entities', []):\n",
"            extracted_data.append({\n",
"                'document_id': document.id,\n",
"                # start/end are character offsets into the document text\n",
"                'word': content[entity.start:entity.end],\n",
"                'entity_type': entity.entity,\n",
"                # Store a plain Python float rather than the annotation's score object\n",
"                'score': float(entity.score),\n",
"                'url': url\n",
"            })\n",
"\n",
"    # Passing the columns explicitly keeps the schema stable for empty results, so\n",
"    # later filtering or de-duplication on these columns does not raise KeyError\n",
"    return pd.DataFrame(extracted_data, columns=columns)\n",
"\n",
"# Extract the named entities, then drop rows that repeat the same word,\n",
"# entity type and score. Chained assignment (no inplace mutation) keeps the\n",
"# cell idempotent when it is re-run.\n",
"df_entities = extract_named_entities_with_ids_and_url(extracted_documents).drop_duplicates(\n",
"    subset=['word', 'entity_type', 'score']\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>document_id</th>\n",
" <th>word</th>\n",
" <th>score</th>\n",
" <th>url</th>\n",
" </tr>\n",
" <tr>\n",
" <th>entity_type</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>LOC</th>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MISC</th>\n",
" <td>55</td>\n",
" <td>55</td>\n",
" <td>55</td>\n",
" <td>55</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ORG</th>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>PER</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"'Chinese New Year Also called: Lunar New Year Related Topics: China Chinese zodiac Lunar New Year January February\\nNews •\\nChinese New Year, annual 15-day festival in China and Chinese communities around the world that begins with the new moon that occurs sometime between January 21 and February 20 according to Western calendars. Festivities last until the following full moon.\\nThe holiday is sometimes called the Lunar New Year because the dates of celebration follow the phases of the moon. Since the mid-1990s people in China have been given seven consecutive days off work during the Chinese New Year. This week of relaxation has been designated Spring Festival, a term that is sometimes used to refer to the Chinese New Year in general.\\nThe origins of the Chinese New Year are steeped in legend. One legend is that thousands of years ago a monster named Nian (“Year”) would attack villagers at the beginning of each new year. The monster was afraid of loud noises, bright lights, and the colour red, so those things were used to chase the beast away. Celebrations to usher out the old year and bring forth the luck and prosperity of the new one, therefore, often include firecrackers, fireworks, and red clothes and decorations. Young people are given money in colourful red envelopes. In addition, Chinese New Year is a time to feast and to visit family members. Many traditions of the season honour relatives who have died.\\nAmong other Chinese New Year traditions is the thorough cleaning of one’s home to rid the resident of any lingering bad luck. Some people prepare and enjoy special foods on certain days during the celebrations. The last event held during the Chinese New Year is called the Lantern Festival, during which people hang glowing lanterns in temples or carry them during a nighttime parade. Since the dragon is a Chinese symbol of good fortune, a dragon dance highlights festival celebrations in many areas. 
This procession involves a long, colourful dragon being carried through the streets by numerous dancers.'"
" document_id word score url\n",
"entity_type \n",
"LOC 5 5 5 5\n",
"MISC 55 55 55 55\n",
"ORG 6 6 6 6\n",
"PER 4 4 4 4"
]
},
"execution_count": 10,
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extracted_documents[0].content"
"# Count the entities with a score above 0.9 for each entity type\n",
"(\n",
"    df_entities\n",
"    .loc[df_entities['score'] > 0.9]\n",
"    .groupby(\"entity_type\")\n",
"    .count()\n",
")"
]
},
{
Expand Down

0 comments on commit 984a5c5

Please sign in to comment.