From de90640cb4949cc98d4e973e5cbf7d53e755f5e0 Mon Sep 17 00:00:00 2001 From: Roman Kalyakin Date: Thu, 24 Oct 2024 12:00:55 +0200 Subject: [PATCH] updated ner/nel (#12) --- examples/notebooks/tools.ipynb | 281 ++++++++++++++++-- ...mpresso_named_entity_recognition_entity.py | 35 ++- ...ed_entity_recognition_entity_confidence.py | 14 +- ...so_named_entity_recognition_entity_type.py | 50 ++++ ...amed_entity_recognition_entity_wikidata.py | 9 + impresso/api_models.py | 37 ++- pyproject.toml | 2 +- 7 files changed, 377 insertions(+), 51 deletions(-) diff --git a/examples/notebooks/tools.ipynb b/examples/notebooks/tools.ipynb index 9570b34..0fc9c9b 100644 --- a/examples/notebooks/tools.ipynb +++ b/examples/notebooks/tools.ipynb @@ -2,17 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🎉 You are now connected to the Impresso API! 🎉\n" - ] - } - ], + "outputs": [], "source": [ "from impresso import connect\n", "\n", @@ -34,9 +26,13 @@ { "data": { "text/html": [ + "
\n", + "
\n", "

Ner result

\n", - "
Contains 19 items of 19 total items.
\n", + "
Contains 9 items of 9 total items.
\n", "
\n", + "
\n", + "
\n", "

Data preview:

\n", "
\n", "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
typesurfaceFormfunctionnameconfidence.nerconfidence.neloffset.startoffset.endwikidata.idwikidata.wikipediaPageNamewikidata.wikipediaPageUrl
id
1:37:pers:ner-stacked-2-bert-medium-historic-multilingual|ner-mgenre-multilingualpersJean-Baptiste Nicolas Robert SchumanN/ABaptiste Nicolas Robert Schuman93.8199.57137Q15981Robert Schumanhttps://en.wikipedia.org/wiki/Robert_Schuman
41:53:time:ner-stacked-2-bert-medium-historic-multilingual|ner-mgenre-multilingualtime29 June 1886N/AN/A86.4993.574153Q15981Robert Schumanhttps://en.wikipedia.org/wiki/Robert_Schuman
56:72:time:ner-stacked-2-bert-medium-historic-multilingual|ner-mgenre-multilingualtime4 September 1963N/AN/A74.5381.875672NaNNaNNaN
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = \"\"\"\n", + "Jean-Baptiste Nicolas Robert Schuman ( \n", + "29 June 1886 – 4 September 1963) was a Luxembourg-born French \n", + "statesman. Schuman was a Christian democratic (Popular \n", + "Republican Movement) political thinker and activist. \n", + "\"\"\"\n", + "result = impresso.tools.ner_nel(\n", + " text=text,\n", + ")\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "

Ner result

\n", + "
Contains 1 items of 1 total items.
\n", + "
\n", + "
\n", + "
\n", + "

Data preview:

\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
typesurfaceFormconfidence.neloffset.startoffset.endwikidata.idwikidata.wikipediaPageNamewikidata.wikipediaPageUrl
id
8:44:UNK:nel-mgenre-multilingualunkJean-Baptiste Nicolas Robert Schuman99.94844Q15981Robert Schumanhttps://en.wikipedia.org/wiki/Robert_Schuman
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = \"\"\"\n", + "[START]Jean-Baptiste Nicolas Robert Schuman[END] ( \n", + "29 June 1886 – 4 September 1963) was a Luxembourg-born French \n", + "statesman. Schuman was a Christian democratic (Popular \n", + "Republican Movement) political thinker and activist. \n", + "\"\"\"\n", + "result = impresso.tools.nel(\n", + " text=text,\n", + ")\n", + "result" + ] } ], "metadata": { diff --git a/impresso/api_client/models/impresso_named_entity_recognition_entity.py b/impresso/api_client/models/impresso_named_entity_recognition_entity.py index 21b7117..9c6338c 100644 --- a/impresso/api_client/models/impresso_named_entity_recognition_entity.py +++ b/impresso/api_client/models/impresso_named_entity_recognition_entity.py @@ -23,9 +23,9 @@ class ImpressoNamedEntityRecognitionEntity: Attributes: id (str): ID of the entity type (ImpressoNamedEntityRecognitionEntityType): Type of the entity - surface_form (str): Surface form of the entity - offset (ImpressoNamedEntityRecognitionEntityOffset): confidence (ImpressoNamedEntityRecognitionEntityConfidence): + surface_form (Union[Unset, str]): Surface form of the entity + offset (Union[Unset, ImpressoNamedEntityRecognitionEntityOffset]): is_type_nested (Union[Unset, bool]): Whether the entity type is nested wikidata (Union[Unset, ImpressoNamedEntityRecognitionEntityWikidata]): function (Union[Unset, str]): Function of the entity @@ -34,9 +34,9 @@ class ImpressoNamedEntityRecognitionEntity: id: str type: ImpressoNamedEntityRecognitionEntityType - surface_form: str - offset: "ImpressoNamedEntityRecognitionEntityOffset" confidence: "ImpressoNamedEntityRecognitionEntityConfidence" + surface_form: Union[Unset, str] = UNSET + offset: Union[Unset, "ImpressoNamedEntityRecognitionEntityOffset"] = UNSET is_type_nested: Union[Unset, bool] = UNSET wikidata: Union[Unset, "ImpressoNamedEntityRecognitionEntityWikidata"] = UNSET function: Union[Unset, str] = UNSET @@ -47,11 +47,13 @@ def to_dict(self) -> Dict[str, Any]: type = self.type.value + confidence = self.confidence.to_dict() + surface_form = self.surface_form - offset = self.offset.to_dict() - - confidence = self.confidence.to_dict() + offset: Union[Unset, Dict[str, Any]] = UNSET + if not isinstance(self.offset, Unset): + offset = self.offset.to_dict() is_type_nested = self.is_type_nested @@ -68,11 +70,13 @@ def to_dict(self) -> Dict[str, Any]: { "id": id, "type": type, - "surfaceForm": surface_form, - "offset": offset, "confidence": confidence, } ) + if surface_form is not UNSET: + field_dict["surfaceForm"] = surface_form + if offset is not UNSET: + field_dict["offset"] = offset if is_type_nested is not UNSET: field_dict["isTypeNested"] = is_type_nested if wikidata is not UNSET: @@ -99,11 +103,16 @@ def from_dict(cls: Type[T], src_dict: Dict[str, Any]) -> T: type = ImpressoNamedEntityRecognitionEntityType(d.pop("type")) - surface_form = d.pop("surfaceForm") + confidence = ImpressoNamedEntityRecognitionEntityConfidence.from_dict(d.pop("confidence")) - offset = ImpressoNamedEntityRecognitionEntityOffset.from_dict(d.pop("offset")) + surface_form = d.pop("surfaceForm", UNSET) - confidence = ImpressoNamedEntityRecognitionEntityConfidence.from_dict(d.pop("confidence")) + _offset = d.pop("offset", UNSET) + offset: Union[Unset, ImpressoNamedEntityRecognitionEntityOffset] + if isinstance(_offset, Unset): + offset = UNSET + else: + offset = ImpressoNamedEntityRecognitionEntityOffset.from_dict(_offset) is_type_nested = d.pop("isTypeNested", UNSET) @@ -121,9 +130,9 @@ def from_dict(cls: Type[T], src_dict: Dict[str, Any]) -> T: impresso_named_entity_recognition_entity = cls( id=id, type=type, + confidence=confidence, surface_form=surface_form, offset=offset, - confidence=confidence, is_type_nested=is_type_nested, wikidata=wikidata, function=function, diff --git a/impresso/api_client/models/impresso_named_entity_recognition_entity_confidence.py b/impresso/api_client/models/impresso_named_entity_recognition_entity_confidence.py index 2602680..fba3d88 100644 --- a/impresso/api_client/models/impresso_named_entity_recognition_entity_confidence.py +++ b/impresso/api_client/models/impresso_named_entity_recognition_entity_confidence.py @@ -11,11 +11,11 @@ class ImpressoNamedEntityRecognitionEntityConfidence: """ Attributes: - ner (float): Confidence score for the named entity recognition + ner (Union[Unset, float]): Confidence score for the named entity recognition nel (Union[Unset, float]): Confidence score for the named entity linking """ - ner: float + ner: Union[Unset, float] = UNSET nel: Union[Unset, float] = UNSET def to_dict(self) -> Dict[str, Any]: @@ -24,11 +24,9 @@ def to_dict(self) -> Dict[str, Any]: nel = self.nel field_dict: Dict[str, Any] = {} - field_dict.update( - { - "ner": ner, - } - ) + field_dict.update({}) + if ner is not UNSET: + field_dict["ner"] = ner if nel is not UNSET: field_dict["nel"] = nel @@ -37,7 +35,7 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls: Type[T], src_dict: Dict[str, Any]) -> T: d = src_dict.copy() - ner = d.pop("ner") + ner = d.pop("ner", UNSET) nel = d.pop("nel", UNSET) diff --git a/impresso/api_client/models/impresso_named_entity_recognition_entity_type.py b/impresso/api_client/models/impresso_named_entity_recognition_entity_type.py index 5d6edbc..da73a3a 100644 --- a/impresso/api_client/models/impresso_named_entity_recognition_entity_type.py +++ b/impresso/api_client/models/impresso_named_entity_recognition_entity_type.py @@ -25,6 +25,30 @@ class ImpressoNamedEntityRecognitionEntityType(str, Enum): ORG_ADM = "org.adm" ORG_ENT = "org.ent" ORG_ENT_PRESSAGENCY = "org.ent.pressagency" + ORG_ENT_PRESSAGENCY_AFP = "org.ent.pressagency.AFP" + ORG_ENT_PRESSAGENCY_AG = "org.ent.pressagency.ag" + ORG_ENT_PRESSAGENCY_ANSA = "org.ent.pressagency.ANSA" + ORG_ENT_PRESSAGENCY_AP = "org.ent.pressagency.AP" + ORG_ENT_PRESSAGENCY_APA = "org.ent.pressagency.APA" + ORG_ENT_PRESSAGENCY_ATS_SDA = "org.ent.pressagency.ATS-SDA" + ORG_ENT_PRESSAGENCY_BELGA = "org.ent.pressagency.Belga" + ORG_ENT_PRESSAGENCY_CTK = "org.ent.pressagency.CTK" + ORG_ENT_PRESSAGENCY_DDP_DAPD = "org.ent.pressagency.DDP-DAPD" + ORG_ENT_PRESSAGENCY_DNB = "org.ent.pressagency.DNB" + ORG_ENT_PRESSAGENCY_DOMEI = "org.ent.pressagency.Domei" + ORG_ENT_PRESSAGENCY_DPA = "org.ent.pressagency.DPA" + ORG_ENT_PRESSAGENCY_EUROPAPRESS = "org.ent.pressagency.Europapress" + ORG_ENT_PRESSAGENCY_EXTEL = "org.ent.pressagency.Extel" + ORG_ENT_PRESSAGENCY_HAVAS = "org.ent.pressagency.Havas" + ORG_ENT_PRESSAGENCY_KIPA = "org.ent.pressagency.Kipa" + ORG_ENT_PRESSAGENCY_REUTERS = "org.ent.pressagency.Reuters" + ORG_ENT_PRESSAGENCY_SPK_SMP = "org.ent.pressagency.SPK-SMP" + ORG_ENT_PRESSAGENCY_STEFANI = "org.ent.pressagency.Stefani" + ORG_ENT_PRESSAGENCY_TASS = "org.ent.pressagency.TASS" + ORG_ENT_PRESSAGENCY_UNK = "org.ent.pressagency.unk" + ORG_ENT_PRESSAGENCY_UP_UPI = "org.ent.pressagency.UP-UPI" + ORG_ENT_PRESSAGENCY_WOLFF = "org.ent.pressagency.Wolff" + ORG_ENT_PRESSAGENCY_XINHUA = "org.ent.pressagency.Xinhua" PERS = "pers" PERS_COLL = "pers.coll" PERS_IND = "pers.ind" @@ -35,6 +59,7 @@ class ImpressoNamedEntityRecognitionEntityType(str, Enum): TIME = "time" TIME_DATE_ABS = "time.date.abs" TIME_HOUR_ABS = "time.hour.abs" + UNK = "unk" def __str__(self) -> str: return str(self.value) @@ -63,6 +88,30 @@ def __str__(self) -> str: "org.adm", "org.ent", "org.ent.pressagency", + "org.ent.pressagency.AFP", + "org.ent.pressagency.ag", + "org.ent.pressagency.ANSA", + "org.ent.pressagency.AP", + "org.ent.pressagency.APA", + "org.ent.pressagency.ATS-SDA", + "org.ent.pressagency.Belga", + "org.ent.pressagency.CTK", + "org.ent.pressagency.DDP-DAPD", + "org.ent.pressagency.DNB", + "org.ent.pressagency.Domei", + "org.ent.pressagency.DPA", + "org.ent.pressagency.Europapress", + "org.ent.pressagency.Extel", + "org.ent.pressagency.Havas", + "org.ent.pressagency.Kipa", + "org.ent.pressagency.Reuters", + "org.ent.pressagency.SPK-SMP", + "org.ent.pressagency.Stefani", + "org.ent.pressagency.TASS", + "org.ent.pressagency.unk", + "org.ent.pressagency.UP-UPI", + "org.ent.pressagency.Wolff", + "org.ent.pressagency.Xinhua", "pers", "pers.coll", "pers.ind", @@ -73,4 +122,5 @@ def __str__(self) -> str: "time", "time.date.abs", "time.hour.abs", + "unk", ] diff --git a/impresso/api_client/models/impresso_named_entity_recognition_entity_wikidata.py b/impresso/api_client/models/impresso_named_entity_recognition_entity_wikidata.py index 40d2636..3b42079 100644 --- a/impresso/api_client/models/impresso_named_entity_recognition_entity_wikidata.py +++ b/impresso/api_client/models/impresso_named_entity_recognition_entity_wikidata.py @@ -13,16 +13,20 @@ class ImpressoNamedEntityRecognitionEntityWikidata: Attributes: id (str): Wikidata ID of the entity wikipedia_page_name (Union[Unset, str]): Wikipedia page name of the entity + wikipedia_page_url (Union[Unset, str]): Wikipedia page URL of the entity """ id: str wikipedia_page_name: Union[Unset, str] = UNSET + wikipedia_page_url: Union[Unset, str] = UNSET def to_dict(self) -> Dict[str, Any]: id = self.id wikipedia_page_name = self.wikipedia_page_name + wikipedia_page_url = self.wikipedia_page_url + field_dict: Dict[str, Any] = {} field_dict.update( { @@ -31,6 +35,8 @@ def to_dict(self) -> Dict[str, Any]: ) if wikipedia_page_name is not UNSET: field_dict["wikipediaPageName"] = wikipedia_page_name + if wikipedia_page_url is not UNSET: + field_dict["wikipediaPageUrl"] = wikipedia_page_url return field_dict @@ -41,9 +47,12 @@ def from_dict(cls: Type[T], src_dict: Dict[str, Any]) -> T: wikipedia_page_name = d.pop("wikipediaPageName", UNSET) + wikipedia_page_url = d.pop("wikipediaPageUrl", UNSET) + impresso_named_entity_recognition_entity_wikidata = cls( id=id, wikipedia_page_name=wikipedia_page_name, + wikipedia_page_url=wikipedia_page_url, ) return impresso_named_entity_recognition_entity_wikidata diff --git a/impresso/api_models.py b/impresso/api_models.py index 0dd2015..67a1992 100644 --- a/impresso/api_models.py +++ b/impresso/api_models.py @@ -244,7 +244,8 @@ class Confidence(BaseModel): extra='forbid', ) ner: Annotated[ - float, Field(description='Confidence score for the named entity recognition') + Optional[float], + Field(None, description='Confidence score for the named entity recognition'), ] nel: Annotated[ Optional[float], @@ -260,6 +261,9 @@ class Wikidata(BaseModel): wikipediaPageName: Annotated[ Optional[str], Field(None, description='Wikipedia page name of the entity') ] + wikipediaPageUrl: Annotated[ + Optional[str], Field(None, description='Wikipedia page URL of the entity') + ] class ImpressoNerEntity(BaseModel): @@ -291,6 +295,30 @@ class ImpressoNerEntity(BaseModel): 'org.adm', 'org.ent', 'org.ent.pressagency', + 'org.ent.pressagency.AFP', + 'org.ent.pressagency.ANSA', + 'org.ent.pressagency.AP', + 'org.ent.pressagency.APA', + 'org.ent.pressagency.ATS-SDA', + 'org.ent.pressagency.Belga', + 'org.ent.pressagency.CTK', + 'org.ent.pressagency.DDP-DAPD', + 'org.ent.pressagency.DNB', + 'org.ent.pressagency.DPA', + 'org.ent.pressagency.Domei', + 'org.ent.pressagency.Europapress', + 'org.ent.pressagency.Extel', + 'org.ent.pressagency.Havas', + 'org.ent.pressagency.Kipa', + 'org.ent.pressagency.Reuters', + 'org.ent.pressagency.SPK-SMP', + 'org.ent.pressagency.Stefani', + 'org.ent.pressagency.TASS', + 'org.ent.pressagency.UP-UPI', + 'org.ent.pressagency.Wolff', + 'org.ent.pressagency.Xinhua', + 'org.ent.pressagency.ag', + 'org.ent.pressagency.unk', 'pers', 'pers.coll', 'pers.ind', @@ -301,11 +329,14 @@ class ImpressoNerEntity(BaseModel): 'time', 'time.date.abs', 'time.hour.abs', + 'unk', ], Field(description='Type of the entity'), ] - surfaceForm: Annotated[str, Field(description='Surface form of the entity')] - offset: Offset + surfaceForm: Annotated[ + Optional[str], Field(None, description='Surface form of the entity') + ] + offset: Optional[Offset] = None isTypeNested: Annotated[ Optional[bool], Field(None, description='Whether the entity type is nested') ] diff --git a/pyproject.toml b/pyproject.toml index a1b38ac..3e0be2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ packages = [ ] readme = "README.md" repository = "https://github.com/impresso/impresso-py" -version = "0.9.8" +version = "0.9.9" [tool.poetry.urls] Endpoint = "https://impresso-project.ch/public-api"