Skip to content

Commit

Permalink
Merge pull request #29 from CentreForDigitalHumanities/feature/fetch-…
Browse files Browse the repository at this point in the history
…by-range

Allow fetching of specific ranges
  • Loading branch information
tijmenbaarda authored Apr 24, 2024
2 parents ac01783 + 4e7e610 commit ce07fba
Show file tree
Hide file tree
Showing 14 changed files with 339 additions and 147 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,6 @@ dmypy.json

# Pyre type checker
.pyre/

.vscode/
.idea/
4 changes: 2 additions & 2 deletions edpop_explorer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
'EDPOPREC', 'RELATORS', 'bind_common_namespaces',
'Field', 'FieldError', 'LocationField',
'Reader', 'ReaderError', 'NotFoundError',
'GetByIdBasedOnQueryMixin', 'PreparedQuery', 'PreparedQueryType',
'GetByIdBasedOnQueryMixin', 'BasePreparedQuery', 'PreparedQueryType',
'Record', 'RawData', 'RecordError', 'BibliographicalRecord',
'BiographicalRecord', 'LazyRecordMixin',
'SRUReader',
Expand All @@ -19,7 +19,7 @@
from .rdf import EDPOPREC, RELATORS, bind_common_namespaces
from .fields import Field, FieldError, LocationField
from .reader import (
Reader, ReaderError, GetByIdBasedOnQueryMixin, PreparedQuery,
Reader, ReaderError, GetByIdBasedOnQueryMixin, BasePreparedQuery,
PreparedQueryType, NotFoundError
)
from .record import (
Expand Down
222 changes: 155 additions & 67 deletions edpop_explorer/reader.py
Original file line number Diff line number Diff line change
@@ -1,127 +1,175 @@
"""Base reader class and strongly related functionality."""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional, List, Union
from typing import Optional, Union, Dict
from rdflib import Graph, RDF, URIRef
from urllib.parse import quote, unquote


from edpop_explorer import (
EDPOPREC, BIBLIOGRAPHICAL, BIOGRAPHICAL, bind_common_namespaces
)
from .record import Record


@dataclass
class PreparedQuery:
class BasePreparedQuery:
"""Empty base dataclass for prepared queries. For prepared queries that
can be represented by a single string, do not inherit from this class
but use a simple string instead."""
pass


PreparedQueryType = Union[str, PreparedQuery]
PreparedQueryType = Union[str, BasePreparedQuery]


class Reader(ABC):
'''Base reader class (abstract).
"""Base reader class (abstract).
This abstract base class provides a common interface for all readers.
To use, instantiate a subclass, set a query using the
To use, instantiate a subclass, set a query using the
``prepare_query()`` or ``set_query()`` method, call ``fetch()``
and subsequently ``fetch_next()`` until you have the number
of results that you want. The attributes ``number_of_results``,
``number_fetched`` and ``records`` will be updated after
``number_fetched`` and ``records`` will be updated after
fetching.
To create a concrete reader, make a subclass that implements the
``fetch()``, ``fetch_next()`` and ``transform_query()`` methods
To create a concrete reader, make a subclass that implements the
``fetch_range()`` and ``transform_query()`` methods
and set the ``READERTYPE`` and ``CATALOG_URIREF`` attributes.
``fetch()`` and ``fetch_next()`` should populate the
``records``, ``number_of_results`` and ``number_fetched``
attributes.
'''
``fetch_range()`` should populate the ``records``, ``number_of_results``,
``number_fetched`` and ``range_fetched`` attributes.
"""
number_of_results: Optional[int] = None
'''The total number of results for the query, including those
that have not been fetched yet.'''
number_fetched: int = 0
'''The number of results that has been fetched so far, or 0 if
no fetch has been performed yet.'''
records: List[Optional[Record]]
'''The records that have been fetched as instances of
(a subclass of) ``Record``.'''
"""The total number of results for the query, or None if fetching
has not yet started and the number is not yet known."""
records: Dict[int, Record]
"""The records that have been fetched as instances of
(a subclass of) ``Record``."""
prepared_query: Optional[PreparedQueryType] = None
'''A transformed version of the query, available after
calling ``prepare_query()`` or ``set_query``.'''
"""A transformed version of the query, available after
calling ``prepare_query()`` or ``set_query``."""
READERTYPE: Optional[str] = None
'''The type of the reader, out of ``BIOGRAPHICAL`` and
``BIBLIOGRAPHICAL`` (defined in the ``edpop_explorer`` package).'''
"""The type of the reader, out of ``BIOGRAPHICAL`` and
``BIBLIOGRAPHICAL`` (defined in the ``edpop_explorer`` package)."""
CATALOG_URIREF: Optional[URIRef] = None
IRI_PREFIX: Optional[str] = None
'''The prefix to use to create an IRI out of a record identifier.
"""The prefix to use to create an IRI out of a record identifier.
If an IRI cannot be created with a simple prefix, the
`identifier_to_iri` and `iri_to_identifier` methods have to be
overridden.'''
_graph: Optional[Graph] = None
overridden."""
FETCH_ALL_AT_ONCE = False
"""True if the reader is configured to fetch all records at once,
even if the user only needs a subset."""
DEFAULT_RECORDS_PER_PAGE: int = 10
"""The number of records to fetch at a time using the ``fetch()``
method if not determined by user."""
_fetch_position: int = 0
"""The index of the record that was fetched last. This is used by
the ``fetch()`` method to decide where to continue fetching."""

def __init__(self):
self.records = []
self.records = {}

@classmethod
@abstractmethod
def transform_query(cls, query: str) -> PreparedQueryType:
'''Return a version of the query that is prepared for use in the
"""Return a version of the query that is prepared for use in the
API.
This method does not have to be called directly; instead
``prepare_query()`` can be used.'''
This method does not have to be called directly; instead
``prepare_query()`` can be used."""
pass

def prepare_query(self, query: str) -> None:
'''Prepare a query for use by the reader's API. Updates the
``prepared_query`` attribute.'''
"""Prepare a query for use by the reader's API. Updates the
``prepared_query`` attribute."""
self.prepared_query = self.transform_query(query)

def set_query(self, query: PreparedQueryType) -> None:
'''Set an exact query. Updates the ``prepared_query``
attribute.'''
"""Set an exact query. Updates the ``prepared_query``
attribute."""
self.prepared_query = query

def adjust_start_record(self, start_number: int) -> None:
"""Skip the given number of first records and start fetching
afterwards. Should be calling before the first time calling
``fetch()``. The missing records in the ``records`` attribute
will be filled by ``None``s. The ``number_fetched`` attribute
will be adjusted as if the first records have been fetched.
This is mainly useful if the skipped records have already been
fetched but the original ``Reader`` object is not available anymore.
This functionality may be ignored by readers that can only load
all records at once; generally these are readers that return lazy
"""Skip the given number of first records and start fetching
afterwards.
This functionality may be ignored by readers that can only load
all records at once; generally these are readers that return lazy
records."""
if self.number_of_results is not None:
raise ReaderError(
"adjust_start_record should not be called after fetching."
)
self.number_fetched = start_number
self.records = [None for _ in range(start_number)]
self._fetch_position = start_number

@abstractmethod
def fetch(
self, number: Optional[int] = None
):
'''Perform an initial or subsequent query. Most readers fetch
) -> range:
"""Perform an initial or subsequent query. Most readers fetch
a limited number of records at once -- this number depends on
the reader but it may be adjusted using the ``number`` argument.
the reader but it may be adjusted using the ``number`` parameter.
Other readers fetch all records at once and ignore the ``number``
argument. After fetching, the ``records`` and ``number_fetched``
attributes are adjusted and the ``number_of_results`` attribute
will be available.'''
parameter. After fetching, the records are available in the
``records`` attribute and the ``number_of_results`` attribute
will be available. Returns the range of record indexes that has
been fetched."""
if self.fetching_exhausted:
return range(0)
if number is None:
number = self.DEFAULT_RECORDS_PER_PAGE
resulting_range = self.fetch_range(range(self._fetch_position,
self._fetch_position + number))
self._fetch_position = resulting_range.stop
return resulting_range

@abstractmethod
def fetch_range(self, range_to_fetch: range) -> range:
"""Fetch a specific range of records. After fetching, the records
are available in the ``records`` attribute and the
``number_of_results`` attribute will be available. If not all records
of the specified range exist, only the records that exist will be
fetched.
:param range_to_fetch: The range of records to fetch. ``step`` values
of ranges other than 1 are not supported and may be ignored.
:returns: The range of record indexes that has actually been fetched.
"""
pass

def get(self, index: int, allow_fetching: bool = True) -> Record:
    """Get a record with a specific index. If the record is not yet
    available, fetch additional records to make it available.

    :param index: The number of the record to get.
    :param allow_fetching: Allow fetching the record from an external
        source if it was not yet fetched.
    :returns: The record stored under ``index`` in ``records``.
    :raises NotFoundError: If the record is not available and could not
        be (or was not allowed to be) fetched.
    """
    try:
        return self.records[index]
    except KeyError:
        pass
    # Fetching can only succeed if the index lies within the available
    # range. When the total is still unknown (no fetch performed yet),
    # give it a try anyway.
    may_exist = (
        self.number_of_results is None
        or index < self.number_of_results
    )
    record = None
    if allow_fetching and may_exist:
        # Fetch exactly the requested record and look it up again
        self.fetch_range(range(index, index + 1))
        record = self.records.get(index)
    if record is None:
        raise NotFoundError(f"Item with index {index} is not available.")
    return record

@classmethod
@abstractmethod
def get_by_id(cls, identifier: str) -> Record:
'''Get a single record by its identifier.'''
"""Get a single record by its identifier."""
pass

@classmethod
def get_by_iri(cls, iri: str) -> Record:
'''Get a single records by its IRI.'''
"""Get a single record by its IRI."""
identifier = cls.iri_to_identifier(iri)
return cls.get_by_id(identifier)

Expand Down Expand Up @@ -170,18 +218,55 @@ def catalog_to_graph(cls) -> Graph:

# Set namespace prefixes
bind_common_namespaces(g)

return g

@property
def fetching_exhausted(self) -> bool:
"""Return ``True`` if all results have been fetched. This is currently
implemented by simply checking if the ``number_of_results`` and
``number_fetched`` attributes are equal."""
return self.number_fetched == self.number_of_results
"""Return ``True`` if all results have been fetched."""
return self.fetching_started and self.number_of_results == self.number_fetched

@property
def fetching_started(self) -> bool:
"""``True`` if fetching has started, otherwise ``False``. As soon
as fetching has started, changing the query is not possible anymore."""
return self.number_of_results is not None

@property
def number_fetched(self) -> int:
"""The number of results that have been fetched so far, or 0 if
no fetch has been performed yet."""
return len(self.records)

def generate_identifier(self) -> str:
"""Generate an identifier for this reader that is unique for the
combination of reader type and prepared query. This identifier can
be used when the reader has to be reused across sessions by
pickling and unpickling.
Note: while the identifier is guaranteed to be unique, there
is no guarantee that the same combination of reader type and
prepared query always yields the same identifier."""
if self.prepared_query is None:
raise RuntimeError("A prepared query should be set first")
# Create identifier based on reader class name and prepared query.
readertype = self.__class__
# self.prepared_query is either a string or a dataclass instance,
# which means that it has a __str__ method that gives a unique
# string representation of its contents (at least as long as
# it does not contain a very complex data structure, which should
# not be the case). For dataclasses, this is not strictly guaranteed:
# fields declared with repr=False are left out of the representation.
prepared_query = str(self.prepared_query)
return f"{readertype} | {prepared_query}"


class GetByIdBasedOnQueryMixin(ABC):
"""Mixin for readers that are based on an API that has no special
way of retrieving single records -- instead, these readers fetch
single records using a list query. To use, make sure to override
the ``_prepare_get_by_id_query`` method, which defines the list
query that should be used."""

@classmethod
def get_by_id(cls, identifier: str) -> Record:
reader = cls()
Expand All @@ -190,13 +275,14 @@ def get_by_id(cls, identifier: str) -> Record:
reader.set_query(cls._prepare_get_by_id_query(identifier))
reader.fetch()
if reader.number_of_results == 0:
raise ReaderError("No results returned")
for record in reader.records:
raise NotFoundError("No results returned")
for record in reader.records.values():
assert record is not None
if record.identifier == identifier:
return record
# Record with correct ID was not returned in first fetch -
# give up.
raise ReaderError(
raise NotFoundError(
f"Record with identifier {identifier} not present among "
f"{reader.number_of_results} returned results."
)
Expand All @@ -208,6 +294,8 @@ def _prepare_get_by_id_query(cls, identifier: str) -> PreparedQueryType:


class ReaderError(Exception):
"""Generic exception for failures in ``Reader`` class. More specific errors
derive from this class."""
pass


Expand Down
Loading

0 comments on commit ce07fba

Please sign in to comment.