From 984a5c52386cf6fdac14cfa4713f6e1fc9bc8dd6 Mon Sep 17 00:00:00 2001 From: Laura Gutierrez Funderburk Date: Tue, 3 Dec 2024 17:20:36 -0800 Subject: [PATCH] complete ner with Haystack mini project --- ch8/ner-with-haystack.ipynb | 145 ++++++++++++++++++++++++++---------- 1 file changed, 104 insertions(+), 41 deletions(-) diff --git a/ch8/ner-with-haystack.ipynb b/ch8/ner-with-haystack.ipynb index 535b773..bf29672 100644 --- a/ch8/ner-with-haystack.ipynb +++ b/ch8/ner-with-haystack.ipynb @@ -105,65 +105,128 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 37, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'content_type': 'text/html',\n", - " 'url': 'https://www.britannica.com/topic/Chinese-New-Year',\n", - " 'named_entities': [NamedEntityAnnotation(entity='MISC', start=0, end=16, score=np.float32(0.8706437)),\n", - " NamedEntityAnnotation(entity='MISC', start=30, end=44, score=np.float32(0.91554195)),\n", - " NamedEntityAnnotation(entity='LOC', start=61, end=66, score=np.float32(0.9186233)),\n", - " NamedEntityAnnotation(entity='MISC', start=67, end=74, score=np.float32(0.66412055)),\n", - " NamedEntityAnnotation(entity='MISC', start=82, end=96, score=np.float32(0.8688859)),\n", - " NamedEntityAnnotation(entity='MISC', start=121, end=137, score=np.float32(0.9623936)),\n", - " NamedEntityAnnotation(entity='LOC', start=165, end=170, score=np.float32(0.99973375)),\n", - " NamedEntityAnnotation(entity='MISC', start=175, end=182, score=np.float32(0.99974746)),\n", - " NamedEntityAnnotation(entity='MISC', start=311, end=318, score=np.float32(0.9997546)),\n", - " NamedEntityAnnotation(entity='MISC', start=414, end=428, score=np.float32(0.9671516)),\n", - " NamedEntityAnnotation(entity='LOC', start=523, end=528, score=np.float32(0.9997794)),\n", - " NamedEntityAnnotation(entity='MISC', start=588, end=604, score=np.float32(0.9242876)),\n", - " NamedEntityAnnotation(entity='MISC', start=650, end=665, score=np.float32(0.7919462)),\n", - " NamedEntityAnnotation(entity='MISC', start=713, end=729, score=np.float32(0.9364104)),\n", - " NamedEntityAnnotation(entity='MISC', start=761, end=777, score=np.float32(0.9695166)),\n", - " NamedEntityAnnotation(entity='PER', start=859, end=861, score=np.float32(0.73694575)),\n", - " NamedEntityAnnotation(entity='MISC', start=865, end=871, score=np.float32(0.7323043)),\n", - " NamedEntityAnnotation(entity='MISC', start=1304, end=1320, score=np.float32(0.9760999)),\n", - " NamedEntityAnnotation(entity='MISC', start=1443, end=1459, score=np.float32(0.93961865)),\n", - " NamedEntityAnnotation(entity='MISC', start=1673, end=1689, score=np.float32(0.93505925)),\n", - " NamedEntityAnnotation(entity='MISC', start=1704, end=1720, score=np.float32(0.92978096)),\n", - " NamedEntityAnnotation(entity='MISC', start=1838, end=1845, score=np.float32(0.9997471))]}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "extracted_documents[0].meta" + "import pandas as pd\n", + "\n", + "# Function to extract entity annotations into a DataFrame\n", + "# Function to extract uniquely identified named entities into a DataFrame with URL\n", + "def extract_named_entities_with_ids_and_url(documents):\n", + " extracted_data = []\n", + " for document in documents:\n", + " content = document.content\n", + " doc_id = document.id\n", + " url = document.meta.get('url', 'N/A') # Default to 'N/A' if URL is not available\n", + " named_entities = document.meta.get('named_entities', [])\n", + " for entity in named_entities:\n", + " word = content[entity.start:entity.end]\n", + " extracted_data.append({\n", + " 'document_id': doc_id,\n", + " 'word': word,\n", + " 'entity_type': entity.entity,\n", + " 'score': float(entity.score),\n", + " 'url': url\n", + " })\n", + " \n", + " # Convert to pandas DataFrame\n", + " df = pd.DataFrame(extracted_data)\n", + " return df\n", + "\n", + "# Extract and display named entities with unique IDs and URLs\n", + "df_entities = extract_named_entities_with_ids_and_url(extracted_documents)\n", + "df_entities.drop_duplicates(subset=['word', 'entity_type','score'], inplace=True)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 44, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
document_idwordscoreurl
entity_type
LOC5555
MISC55555555
ORG6666
PER4444
\n", + "
" + ], "text/plain": [ - "'Chinese New Year Also called: Lunar New Year Related Topics: China Chinese zodiac Lunar New Year January February\\nNews •\\nChinese New Year, annual 15-day festival in China and Chinese communities around the world that begins with the new moon that occurs sometime between January 21 and February 20 according to Western calendars. Festivities last until the following full moon.\\nThe holiday is sometimes called the Lunar New Year because the dates of celebration follow the phases of the moon. Since the mid-1990s people in China have been given seven consecutive days off work during the Chinese New Year. This week of relaxation has been designated Spring Festival, a term that is sometimes used to refer to the Chinese New Year in general.\\nThe origins of the Chinese New Year are steeped in legend. One legend is that thousands of years ago a monster named Nian (“Year”) would attack villagers at the beginning of each new year. The monster was afraid of loud noises, bright lights, and the colour red, so those things were used to chase the beast away. Celebrations to usher out the old year and bring forth the luck and prosperity of the new one, therefore, often include firecrackers, fireworks, and red clothes and decorations. Young people are given money in colourful red envelopes. In addition, Chinese New Year is a time to feast and to visit family members. Many traditions of the season honour relatives who have died.\\nAmong other Chinese New Year traditions is the thorough cleaning of one’s home to rid the resident of any lingering bad luck. Some people prepare and enjoy special foods on certain days during the celebrations. The last event held during the Chinese New Year is called the Lantern Festival, during which people hang glowing lanterns in temples or carry them during a nighttime parade. Since the dragon is a Chinese symbol of good fortune, a dragon dance highlights festival celebrations in many areas. This procession involves a long, colourful dragon being carried through the streets by numerous dancers.'" + " document_id word score url\n", + "entity_type \n", + "LOC 5 5 5 5\n", + "MISC 55 55 55 55\n", + "ORG 6 6 6 6\n", + "PER 4 4 4 4" ] }, - "execution_count": 10, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "extracted_documents[0].content" + "df_entities[df_entities['score']>0.9].groupby(\"entity_type\").count()" ] }, {