diff --git a/docs/index.md b/docs/index.md index 6f56091..5e11145 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,5 +1,36 @@ # Impresso Python +
+ +
+ +Impresso is a library to interact with the [Impresso](https://impresso-project.ch/app) dataset. It provides a set of classes to interact with the API and a set of tools that make working with the data easier. + +## Installation and prerequisites + +The Impresso python library can be installed using `pip`: + +```shell +pip install impresso +``` + +The library requires Python version `3.10` as a minimum. It has a number of dependencies that are likely already present in a Jupyter environment like `matplotlib` and `pandas`. + +## Create a session + ::: impresso.connect -Hi there. \ No newline at end of file + +## About Impresso + +### Impresso project + +[Impresso - Media Monitoring of the Past](https://impresso-project.ch) is an interdisciplinary research project that aims to develop and consolidate tools for processing and exploring large collections of media archives across modalities, time, languages and national borders. The first project (2017-2021) was funded by the Swiss National Science Foundation under grant No. [CRSII5_173719](http://p3.snf.ch/project-173719) and the second project (2023-2027) by the SNSF under grant No. [CRSII5_213585](https://data.snf.ch/grants/grant/213585) and the Luxembourg National Research Fund under grant No. 17498891. + +### Copyright + +Copyright (C) 2024 The Impresso team. + +### License + +This program is provided as open source under the [GNU Affero General Public License](https://github.com/impresso/impresso-pyindexation/blob/master/LICENSE) v3 or later. diff --git a/docs/preparing_queries.md b/docs/preparing_queries.md new file mode 100644 index 0000000..3e6721b --- /dev/null +++ b/docs/preparing_queries.md @@ -0,0 +1,30 @@ +# Preparing queries + +Some filter parameters accept a combination of modifiers that can be used to create complex queries. 
For example, if we want to search for the terms `Titanic` and `ship`, we can use the `AND` modifier to combine the two conditions: + +```python +from impresso import AND, OR + +impresso.search.find(term=AND("Titanic", "ship")) +``` + +We can refine this condition and search for all content items that mention `Titanic` and `ship` together **OR** mention `Titanic` and `iceberg` together **AND** do not mention `Di Caprio`. + + +```python +from impresso import AND, OR + +impresso.search.find( + term=( + AND("Titanic", "ship") | + AND("Titanic", "iceberg") + ) & ~OR("Di Caprio") +) +``` + +## Modifiers + +::: impresso.structures.OR +::: impresso.structures.AND +::: impresso.structures.DateRange +::: impresso.structures.NumericRange \ No newline at end of file diff --git a/docs/resources.md b/docs/resources.md index bffe386..df60048 100644 --- a/docs/resources.md +++ b/docs/resources.md @@ -8,4 +8,67 @@ Search content items in the Impresso corpus. impresso.search.find(term='Titanic', limit=10) ``` -::: impresso.resources.search.SearchResource \ No newline at end of file +::: impresso.resources.search.SearchResource + +::: impresso.api_client.models.search_order_by.SearchOrderByLiteral +::: impresso.resources.search.SearchDataContainer + +## Entities + +Search entities in the Impresso corpus. + +```python +impresso.entities.find(term="Douglas Adams") +``` + +::: impresso.resources.entities.EntitiesResource + +::: impresso.resources.entities.EntityType +::: impresso.api_client.models.find_entities_order_by.FindEntitiesOrderByLiteral + +## Newspapers + +Search newspapers available in the Impresso corpus. + +```python +impresso.newspapers.find( + term="wort", + order_by="lastIssue", +) +``` + +::: impresso.resources.newspapers.NewspapersResource + +::: impresso.api_client.models.find_newspapers_order_by.FindNewspapersOrderByLiteral +::: impresso.resources.newspapers.FindNewspapersContainer + +## Content Items + +Get a single content item by ID. 
+ +```python +impresso.content_items.get("NZZ-1794-08-09-a-i0002") +``` + +## Collections + +Work with collections. + +::: impresso.resources.collections.CollectionsResource + +::: impresso.api_client.models.find_collections_order_by.FindCollectionsOrderByLiteral +::: impresso.resources.collections.FindCollectionsContainer + +## Named entity recognition + +The Python library contains a set of named entity recognition methods that use the same NER model used to add entities to the Impresso database. + +::: impresso.resources.tools.ToolsResource +::: impresso.resources.tools.NerContainer + +## Text reuse + +Two resources can be used to search text reuse clusters and passages. + +::: impresso.resources.text_reuse.clusters.TextReuseClustersResource +::: impresso.resources.text_reuse.passages.TextReusePassagesResource diff --git a/impresso/client.py b/impresso/client.py index 9aac292..124e742 100644 --- a/impresso/client.py +++ b/impresso/client.py @@ -82,6 +82,12 @@ def connect( """ Connect to the Impresso API and return a client object. + ```python + from impresso import connect + + impresso = connect() + ``` + Args: public_api_url (str): The URL of the Impresso API to connect to. By default using the default URL set in the config file (~/.impresso_py.yml) or the Impresso default URL ({DEFAULT_API_URL}). diff --git a/impresso/data_container.py b/impresso/data_container.py index 8085b55..b520d77 100644 --- a/impresso/data_container.py +++ b/impresso/data_container.py @@ -7,7 +7,11 @@ class DataContainer(Generic[IT, T]): - """Response of a resource call""" + """ + Generic container for responses from the Impresso API + returned by resource methods (`get`, `find`). + Generally represents a single page of the result. 
+ """ def __init__( self, @@ -72,17 +76,17 @@ def _get_preview_image_(self) -> str | None: @property def raw(self) -> dict[str, Any]: - """Return the data as a python dictionary.""" + """Returns the response data as a python dictionary.""" return getattr(self._data, "to_dict")() @property def pydantic(self) -> T: - """Return the data as a pydantic model.""" + """Returns the response data as a pydantic model.""" return self._pydantic_model.model_validate(self.raw) @property def df(self) -> DataFrame: - """Return the data as a pandas dataframe.""" + """Returns the response data as a pandas dataframe.""" return DataFrame.from_dict(self._data) # type: ignore @property @@ -92,12 +96,12 @@ def total(self) -> int: @property def limit(self) -> int: - """Page size.""" + """Current page size.""" return self.raw.get("pagination", {}).get("limit", 0) @property def offset(self) -> int: - """Page offset.""" + """Current page offset.""" return self.raw.get("pagination", {}).get("offset", 0) @property @@ -107,5 +111,8 @@ def size(self) -> int: @property def url(self) -> str | None: - """A URL of the result set in the Impresso web app.""" + """ + URL of an Impresso web application page + representing the result set from this container. + """ return self._web_app_search_result_url diff --git a/impresso/resources/collections.py b/impresso/resources/collections.py index 9012df5..638a29e 100644 --- a/impresso/resources/collections.py +++ b/impresso/resources/collections.py @@ -64,7 +64,9 @@ def total(self) -> int: class CollectionsResource(Resource): - """Work with collections""" + """ + Work with collections. + """ name = "collections" @@ -75,7 +77,18 @@ def find( limit: int | None = None, offset: int | None = None, ) -> FindCollectionsContainer: - """Find collections.""" + """ + Search collections in Impresso. + + Args: + term: Search term. + order_by: Order by aspect. + limit: Number of results to return. + offset: Number of results to skip. 
+ + Returns: + FindCollectionsContainer: Data container with a page of results of the search. + """ result = find_collections.sync( client=self._api_client, @@ -122,7 +135,18 @@ def items( limit: int | None = None, offset: int | None = None, ) -> SearchDataContainer: - """Return all items in a collection.""" + """ + Return all content items from a collection. + + Args: + collection_id: ID of the collection. + limit: Number of results to return. + offset: Number of results to skip. + + Returns: + SearchDataContainer: Data container with a page of results of the search. + """ + search_resource = SearchResource(self._api_client) return search_resource.find( collection_id=collection_id, limit=limit, offset=offset @@ -135,6 +159,10 @@ def add_items(self, collection_id: str, item_ids: list[str]) -> None: **NOTE**: Items are not added immediately. This operation may take up to a few minutes to complete and reflect in the collection. + + Args: + collection_id: ID of the collection. + item_ids: IDs of the content items to add. """ result = patch_collections_collection_id_items.sync( client=self._api_client, @@ -148,11 +176,15 @@ def add_items(self, collection_id: str, item_ids: list[str]) -> None: def remove_items(self, collection_id: str, item_ids: list[str]) -> None: """ - Remove items from a collection by their IDs. + Remove items from a collection by their IDs. - **NOTE**: Items are not added immediately. + **NOTE**: Items are not removed immediately. This operation may take up to a few minutes to complete and reflect in the collection. + + Args: + collection_id: ID of the collection. + item_ids: IDs of the content items to remove. 
""" result = patch_collections_collection_id_items.sync( client=self._api_client, diff --git a/impresso/resources/entities.py b/impresso/resources/entities.py index 6a9536b..3f66609 100644 --- a/impresso/resources/entities.py +++ b/impresso/resources/entities.py @@ -51,7 +51,7 @@ def df(self) -> DataFrame: class EntitiesResource(Resource): - """Work with entities""" + """Search entities in the Impresso database.""" name = "entities" @@ -66,7 +66,22 @@ def find( limit: int | None = None, offset: int | None = None, ) -> FindEntitiesContainer: - """Find entities.""" + """ + Search entities in Impresso. + + Args: + term: Search term. + wikidata_id: Return only entities resolved to this Wikidata ID. + entity_id: Return only entity with this ID. + entity_type: Return only entities of this type. + order_by: Field to order results by. + resolve: Return Wikidata details of the entities, if the entity is linked to a Wikidata entry. + limit: Number of results to return. + offset: Number of results to skip. + + Returns: + FindEntitiesContainer: Data container with a page of results of the search. + """ filters: list[Filter] = [] if entity_type is not None: diff --git a/impresso/resources/newspapers.py b/impresso/resources/newspapers.py index 0ba5831..1e1a123 100644 --- a/impresso/resources/newspapers.py +++ b/impresso/resources/newspapers.py @@ -32,7 +32,7 @@ def df(self) -> DataFrame: class NewspapersResource(Resource): - """Search newspapers""" + """Search newspapers in the Impresso database.""" name = "newspapers" @@ -43,7 +43,18 @@ def find( limit: int | None = None, offset: int | None = None, ) -> FindNewspapersContainer: + """ + Search newspapers in Impresso. + Args: + term: Search term. + order_by: Field to order results by. + limit: Number of results to return. + offset: Number of results to skip. + + Returns: + FindNewspapersContainer: Data container with a page of results of the search. 
+ """ result = find_newspapers.sync( client=self._api_client, term=term if term is not None else UNSET, diff --git a/impresso/resources/search.py b/impresso/resources/search.py index 18727b4..6b011e0 100644 --- a/impresso/resources/search.py +++ b/impresso/resources/search.py @@ -55,55 +55,6 @@ def df(self) -> DataFrame: class FacetDataContainer(DataContainer): """Response of a get facet call.""" - # def __init__( - # self, - # data: IT, - # pydantic_model: type[T], - # limit: int | None, - # offset: int | None, - # web_app_search_result_url: str, - # ): - # super().__init__(data, pydantic_model, web_app_search_result_url) - # self._limit = limit - # self._offset = offset - - # @property - # def raw(self) -> dict[str, Any]: - # """Return the data as a python dictionary.""" - # return self._data.to_dict() - - # @property - # def pydantic(self) -> list[SearchFacetBucket]: - # """Return the data as a pydantic model.""" - # return self._pydantic_model.model_validate(self.raw) - - # @property - # def df(self) -> DataFrame: - # """Return the data as a pandas dataframe.""" - # if len(self.raw["buckets"]) == 0: - # return DataFrame() - # return json_normalize(self.raw["buckets"]).set_index("val") - - # @property - # def size(self) -> int: - # """Current page size.""" - # return len(self.raw.get("buckets", [])) - - # @property - # def total(self) -> int: - # """Total number of results.""" - # return self.raw.get("numBuckets", 0) - - # @property - # def limit(self) -> int: - # """Page size.""" - # return self._limit or len(self.raw["buckets"]) - - # @property - # def offset(self) -> int: - # """Page offset.""" - # return self._offset or 0 - @property def df(self) -> DataFrame: """Return the data as a pandas dataframe.""" @@ -170,7 +121,7 @@ def find( or all/any of the clusters. Returns: - _type_: _description_ + SearchDataContainer: Data container with a page of results of the search. 
""" filters = self._build_filters( diff --git a/impresso/resources/text_reuse/clusters.py b/impresso/resources/text_reuse/clusters.py index 669f452..97453dd 100644 --- a/impresso/resources/text_reuse/clusters.py +++ b/impresso/resources/text_reuse/clusters.py @@ -46,38 +46,6 @@ def df(self) -> DataFrame: return json_normalize(data).set_index("uid") return DataFrame() - # @property - # def pydantic(self): - # """Return the data as a pydantic model.""" - # remapped_raw = { - # "data": self.raw.get("clusters", []), - # "info": self.raw.get("info", {}), - # "total": self.total, - # "limit": self.limit, - # "offset": self.offset, - # } - # return self._pydantic_model.model_validate(remapped_raw) - - # @property - # def size(self) -> int: - # """Current page size.""" - # return len(self.raw.get("clusters", [])) - - # @property - # def total(self) -> int: - # """Total number of results.""" - # return self.raw.get("info", {}).get("total", 0) - - # @property - # def limit(self) -> int: - # """Page size.""" - # return self.raw.get("info", {}).get("limit", 0) - - # @property - # def offset(self) -> int: - # """Page offset.""" - # return self.raw.get("info", {}).get("offset", 0) - Range = tuple[int, int] diff --git a/mkdocs.yml b/mkdocs.yml index 0efc563..6456e2c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -28,4 +28,5 @@ markdown_extensions: nav: - Home: index.md - Result Object: result.md - - Resources: resources.md \ No newline at end of file + - Resources: resources.md + - Preparing Queries: preparing_queries.md \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 68cea79..fcf1095 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ pandas = "^2.1.0" pandas-stubs = "^2.2.1.240316" protobuf = "^4.25.0" pydantic = "^2.6.4" -python = "^3.10.0 || ^3.11.0" +python = "^3.10.0 || ^3.11.0 || ^3.12.0" python-dateutil = "^2.8.0" types-PyYAML = "^6.0.12.20240311" types-protobuf = "^5.27.0.20240626"