Merge pull request #29 from CentreForDigitalHumanities/feature/fetch-by-range

Allow fetching of specific ranges
CentreForDigitalHumanities · Apr 24, 2024 · ce07fba · ce07fba
2 parents ac01783 + 4e7e610
commit ce07fba
Show file tree

Hide file tree

Showing 14 changed files with 339 additions and 147 deletions.
diff --git a/.gitignore b/.gitignore
@@ -127,3 +127,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+.vscode/
+.idea/
diff --git a/edpop_explorer/__init__.py b/edpop_explorer/__init__.py
@@ -2,7 +2,7 @@
     'EDPOPREC', 'RELATORS', 'bind_common_namespaces',
     'Field', 'FieldError', 'LocationField',
     'Reader', 'ReaderError', 'NotFoundError',
-    'GetByIdBasedOnQueryMixin', 'PreparedQuery', 'PreparedQueryType',
+    'GetByIdBasedOnQueryMixin', 'BasePreparedQuery', 'PreparedQueryType',
     'Record', 'RawData', 'RecordError', 'BibliographicalRecord',
     'BiographicalRecord', 'LazyRecordMixin',
     'SRUReader',
@@ -19,7 +19,7 @@
 from .rdf import EDPOPREC, RELATORS, bind_common_namespaces
 from .fields import Field, FieldError, LocationField
 from .reader import (
-    Reader, ReaderError, GetByIdBasedOnQueryMixin, PreparedQuery,
+    Reader, ReaderError, GetByIdBasedOnQueryMixin, BasePreparedQuery,
     PreparedQueryType, NotFoundError
 )
 from .record import (

diff --git a/edpop_explorer/reader.py b/edpop_explorer/reader.py
@@ -1,127 +1,175 @@
+"""Base reader class and strongly related functionality."""
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Optional, List, Union
+from typing import Optional, Union, Dict
 from rdflib import Graph, RDF, URIRef
 from urllib.parse import quote, unquote
 
+
 from edpop_explorer import (
     EDPOPREC, BIBLIOGRAPHICAL, BIOGRAPHICAL, bind_common_namespaces
 )
 from .record import Record
 
 
 @dataclass
-class PreparedQuery:
+class BasePreparedQuery:
+    """Empty base dataclass for prepared queries. For prepared queries that
+    can be represented by a single string, do not inherit from this class
+    but use a simple string instead."""
     pass
 
 
-PreparedQueryType = Union[str, PreparedQuery]
+PreparedQueryType = Union[str, BasePreparedQuery]
 
 
 class Reader(ABC):
-    '''Base reader class (abstract).
+    """Base reader class (abstract).
 
     This abstract base class provides a common interface for all readers.
-    To use, instantiate a subclass, set a query using the 
+    To use, instantiate a subclass, set a query using the
     ``prepare_query()`` or ``set_query()`` method, call ``fetch()``
     and subsequently ``fetch_next()`` until you have the number
     of results that you want. The attributes ``number_of_results``,
-    ``number_fetched`` and ``records`` will be updated after 
+    ``number_fetched`` and ``records`` will be updated after
     fetching.
 
-    To create a concrete reader, make a subclass that implements the 
-    ``fetch()``, ``fetch_next()`` and ``transform_query()`` methods
+    To create a concrete reader, make a subclass that implements the
+    ``fetch_range()`` and ``transform_query()`` methods
     and set the ``READERTYPE`` and ``CATALOG_URIREF`` attributes.
-    ``fetch()`` and ``fetch_next()`` should populate the 
-    ``records``, ``number_of_results`` and ``number_fetched``
-    attributes.
-    '''
+    ``fetch_range()`` should populate the ``records``, ``number_of_results``,
+    ``number_fetched`` and ``range_fetched`` attributes.
+    """
     number_of_results: Optional[int] = None
-    '''The total number of results for the query, including those
-    that have not been fetched yet.'''
-    number_fetched: int = 0
-    '''The number of results that has been fetched so far, or 0 if
-    no fetch has been performed yet.'''
-    records: List[Optional[Record]]
-    '''The records that have been fetched as instances of
-    (a subclass of) ``Record``.'''
+    """The total number of results for the query, or None if fetching
+    has not yet started and the number is not yet known."""
+    records: Dict[int, Record]
+    """The records that have been fetched as instances of
+    (a subclass of) ``Record``."""
     prepared_query: Optional[PreparedQueryType] = None
-    '''A transformed version of the query, available after
-    calling ``prepare_query()`` or ``set_query``.'''
+    """A transformed version of the query, available after
+    calling ``prepare_query()`` or ``set_query``."""
     READERTYPE: Optional[str] = None
-    '''The type of the reader, out of ``BIOGRAPHICAL`` and
-    ``BIBLIOGRAPHICAL`` (defined in the ``edpop_explorer`` package).'''
+    """The type of the reader, out of ``BIOGRAPHICAL`` and
+    ``BIBLIOGRAPHICAL`` (defined in the ``edpop_explorer`` package)."""
     CATALOG_URIREF: Optional[URIRef] = None
     IRI_PREFIX: Optional[str] = None
-    '''The prefix to use to create an IRI out of a record identifier.
+    """The prefix to use to create an IRI out of a record identifier.
     If an IRI cannot be created with a simple prefix, the 
     `identifier_to_iri` and `iri_to_identifier` methods have to be
-    overridden.'''
-    _graph: Optional[Graph] = None
+    overridden."""
+    FETCH_ALL_AT_ONCE = False
+    """True if the reader is configured to fetch all records at once,
+    even if the user only needs a subset."""
+    DEFAULT_RECORDS_PER_PAGE: int = 10
+    """The number of records to fetch at a time using the ``fetch()``
+    method if not determined by user."""
+    _fetch_position: int = 0
+    """The index at which the next ``fetch()`` call starts fetching, i.e.
+    one past the index of the record that was fetched last by ``fetch()``
+    (or the start position set by ``adjust_start_record()``)."""
 
     def __init__(self):
-        self.records = []
+        self.records = {}
 
     @classmethod
     @abstractmethod
     def transform_query(cls, query: str) -> PreparedQueryType:
-        '''Return a version of the query that is prepared for use in the
+        """Return a version of the query that is prepared for use in the
         API.
 
-        This method does not have to be called directly; instead 
-        ``prepare_query()`` can be used.'''
+        This method does not have to be called directly; instead
+        ``prepare_query()`` can be used."""
         pass
 
     def prepare_query(self, query: str) -> None:
-        '''Prepare a query for use by the reader's API. Updates the 
-        ``prepared_query`` attribute.'''
+        """Prepare a query for use by the reader's API. Updates the
+        ``prepared_query`` attribute."""
         self.prepared_query = self.transform_query(query)
 
     def set_query(self, query: PreparedQueryType) -> None:
-        '''Set an exact query. Updates the ``prepared_query``
-        attribute.'''
+        """Set an exact query. Updates the ``prepared_query``
+        attribute."""
         self.prepared_query = query
 
     def adjust_start_record(self, start_number: int) -> None:
-        """Skip the given number of first records and start fetching 
-        afterwards. Should be calling before the first time calling
-        ``fetch()``. The missing records in the ``records`` attribute
-        will be filled by ``None``s. The ``number_fetched`` attribute
-        will be adjusted as if the first records have been fetched.
-        This is mainly useful if the skipped records have already been 
-        fetched but the original ``Reader`` object is not available anymore. 
-        This functionality may be ignored by readers that can only load 
-        all records at once; generally these are readers that return lazy 
+        """Skip the given number of first records and start fetching
+        afterwards.
+
+        This functionality may be ignored by readers that can only load
+        all records at once; generally these are readers that return lazy
         records."""
-        if self.number_of_results is not None:
-            raise ReaderError(
-                "adjust_start_record should not be called after fetching."
-            )
-        self.number_fetched = start_number
-        self.records = [None for _ in range(start_number)]
+        self._fetch_position = start_number
 
-    @abstractmethod
     def fetch(
             self, number: Optional[int] = None
-    ):
-        '''Perform an initial or subsequent query. Most readers fetch
+    ) -> range:
+        """Perform an initial or subsequent query. Most readers fetch
         a limited number of records at once -- this number depends on
-        the reader but it may be adjusted using the ``number`` argument.
+        the reader but it may be adjusted using the ``number`` parameter.
         Other readers fetch all records at once and ignore the ``number``
-        argument. After fetching, the ``records`` and ``number_fetched``
-        attributes are adjusted and the ``number_of_results`` attribute
-        will be available.'''
+        parameter. After fetching, the records are available in the
+        ``records`` attribute and the ``number_of_results`` attribute
+        will be available. Returns the range of record indexes that has
+        been fetched."""
+        if self.fetching_exhausted:
+            return range(0)
+        if number is None:
+            number = self.DEFAULT_RECORDS_PER_PAGE
+        resulting_range = self.fetch_range(range(self._fetch_position,
+                                           self._fetch_position + number))
+        self._fetch_position = resulting_range.stop
+        return resulting_range
+
+    @abstractmethod
+    def fetch_range(self, range_to_fetch: range) -> range:
+        """Fetch a specific range of records. After fetching, the records
+        are available in the ``records`` attribute and the
+        ``number_of_results`` attribute will be available. If not all records
+        of the specified range exist, only the records that exist will be
+        fetched.
+
+        :param range_to_fetch: The range of records to fetch. ``step`` values
+            of ranges other than 1 are not supported and may be ignored.
+        :returns: The range of record indexes that has actually been fetched.
+        """
         pass
 
+    def get(self, index: int, allow_fetching: bool = True) -> Record:
+        """Get a record with a specific index. If the record is not yet
+        available, fetch additional records to make it available.
+
+        :param index: The number of the record to get.
+        :param allow_fetching: Allow fetching the record from an external
+            source if it was not yet fetched.
+        :returns: The record stored under ``index`` in ``records``.
+        :raises NotFoundError: If the record is not available and could
+            not be (or was not allowed to be) fetched.
+        """
+        try:
+            return self.records[index]
+        except KeyError:
+            record = None
+            # Try to fetch, if it is allowed, and if there is a chance that
+            # it is successful: either the number of results is not yet
+            # known, or the index lies within the available range.
+            if (allow_fetching and
+                    (self.number_of_results is None
+                     or index < self.number_of_results)):
+                # Fetch and try again
+                self.fetch_range(range(index, index + 1))
+                record = self.records.get(index)
+            if record is not None:
+                return record
+            else:
+                raise NotFoundError(f"Item with index {index} is not available.")
+
     @classmethod
     @abstractmethod
     def get_by_id(cls, identifier: str) -> Record:
-        '''Get a single record by its identifier.'''
+        """Get a single record by its identifier."""
         pass
 
     @classmethod
     def get_by_iri(cls, iri: str) -> Record:
-        '''Get a single records by its IRI.'''
+        """Get a single record by its IRI."""
         identifier = cls.iri_to_identifier(iri)
         return cls.get_by_id(identifier)
 
@@ -170,18 +218,55 @@ def catalog_to_graph(cls) -> Graph:
 
         # Set namespace prefixes
         bind_common_namespaces(g)
-        
+
         return g
 
     @property
     def fetching_exhausted(self) -> bool:
-        """Return ``True`` if all results have been fetched. This is currently
-        implemented by simply checking if the ``number_of_results`` and
-        ``number_fetched`` attributes are equal."""
-        return self.number_fetched == self.number_of_results
+        """Return ``True`` if all results have been fetched."""
+        return self.fetching_started and self.number_of_results == self.number_fetched
+
+    @property
+    def fetching_started(self) -> bool:
+        """``True`` if fetching has started, otherwise ``False``. As soon
+        as fetching has started, changing the query is not possible anymore."""
+        return self.number_of_results is not None
+
+    @property
+    def number_fetched(self) -> int:
+        """The number of results that has been fetched so far, or 0 if
+        no fetch has been performed yet."""
+        return len(self.records)
+
+    def generate_identifier(self) -> str:
+        """Generate an identifier for this reader that is unique for the
+        combination of reader type and prepared query. This identifier can
+        be used when the reader has to be reused across sessions by
+        pickling and unpickling.
+
+        Note: while the identifier is guaranteed to be unique, there
+        is no guarantee that the same combination of reader type and
+        prepared query always yields the same identifier."""
+        if self.prepared_query is None:
+            raise RuntimeError("A prepared query should be set first")
+        # Create identifier based on reader class name and prepared query.
+        readertype = self.__class__
+        # self.prepared_query is either a string or a dataclass instance.
+        # str() of a string is the string itself; str() of a dataclass
+        # instance falls back to the generated __repr__, which lists all
+        # fields and so gives a unique representation of its contents (at
+        # least as long as it does not contain a very complex data
+        # structure, which should not be the case). For dataclasses that
+        # disable or override __repr__, this is not guaranteed.
+        prepared_query = str(self.prepared_query)
+        return f"{readertype} | {prepared_query}"
 
 
 class GetByIdBasedOnQueryMixin(ABC):
+    """Mixin for readers that are based on an API that has no special
+    way of retrieving single records -- instead, these readers fetch
+    single records using a list query. To use, make sure to override
+    the ``_prepare_get_by_id_query`` method, which defines the list
+    query that should be used."""
+
     @classmethod
     def get_by_id(cls, identifier: str) -> Record:
         reader = cls()
@@ -190,13 +275,14 @@ def get_by_id(cls, identifier: str) -> Record:
         reader.set_query(cls._prepare_get_by_id_query(identifier))
         reader.fetch()
         if reader.number_of_results == 0:
-            raise ReaderError("No results returned")
-        for record in reader.records:
+            raise NotFoundError("No results returned")
+        for record in reader.records.values():
+            assert record is not None
             if record.identifier == identifier:
                 return record
         # Record with correct ID was not returned in first fetch -
         # give up.
-        raise ReaderError(
+        raise NotFoundError(
             f"Record with identifier {identifier} not present among "
             f"{reader.number_of_results} returned results."
         )
@@ -208,6 +294,8 @@ def _prepare_get_by_id_query(cls, identifier: str) -> PreparedQueryType:
 
 
 class ReaderError(Exception):
+    """Generic exception for failures in ``Reader`` class. More specific errors
+    derive from this class."""
     pass
-Original file line number
+Diff line change
@@ Expand Up / @@ -127,3 +127,6 @@ dmypy.json @@
     # Pyre type checker
     .pyre/
+    .vscode/
+    .idea/