From 70669e358141d8a63fc7fb2fda9c1d53b1d2694e Mon Sep 17 00:00:00 2001
From: Lukas Schwab <lukas.schwab@gmail.com>
Date: Wed, 18 Oct 2023 18:43:19 -0700
Subject: [PATCH] Move package logic into __init__.py (#141)

---
 Makefile              |   4 +-
 arxiv/__init__.py     | 768 ++++++++++++++++++++++++++++++++++++++++-
 arxiv/arxiv.py        | 771 +-----------------------------------------
 tests/test_package.py |  30 ++
 4 files changed, 805 insertions(+), 768 deletions(-)
 create mode 100644 tests/test_package.py

diff --git a/Makefile b/Makefile
index 0a45d94..39717f0 100644
--- a/Makefile
+++ b/Makefile
@@ -20,9 +20,7 @@ audit:
 docs: docs/index.html
 docs/index.html: $(source) README.md
 	pdoc --version
-	pdoc --docformat "restructuredtext" ./arxiv/arxiv.py -o docs --no-search
-	mv docs/arxiv/arxiv.html docs/index.html
-	rmdir docs/arxiv
+	pdoc --docformat "restructuredtext" ./arxiv/__init__.py -o docs
 
 clean:
 	rm -rf build dist
diff --git a/arxiv/__init__.py b/arxiv/__init__.py
index 1540b51..9985c69 100644
--- a/arxiv/__init__.py
+++ b/arxiv/__init__.py
@@ -1 +1,767 @@
-from .arxiv import *  # noqa: F403
+""".. include:: ../README.md"""
+from __future__ import annotations
+
+import logging
+import time
+import itertools
+import feedparser
+import os
+import math
+import re
+import requests
+import warnings
+
+from urllib.parse import urlencode
+from urllib.request import urlretrieve
+from datetime import datetime, timedelta, timezone
+from calendar import timegm
+
+from enum import Enum
+from typing import Dict, Generator, List
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_TIME = datetime.min
+
+
+class Result(object):
+    """
+    An entry in an arXiv query results feed.
+
+    See [the arXiv API User's Manual: Details of Atom Results
+    Returned](https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned).
+    """
+
+    entry_id: str
+    """A url of the form `https://arxiv.org/abs/{id}`."""
+    updated: datetime
+    """When the result was last updated."""
+    published: datetime
+    """When the result was originally published."""
+    title: str
+    """The title of the result."""
+    authors: List[Author]
+    """The result's authors."""
+    summary: str
+    """The result abstract."""
+    comment: str
+    """The authors' comment if present."""
+    journal_ref: str
+    """A journal reference if present."""
+    doi: str
+    """A URL for the resolved DOI to an external resource if present."""
+    primary_category: str
+    """
+    The result's primary arXiv category. See [arXiv: Category
+    Taxonomy](https://arxiv.org/category_taxonomy).
+    """
+    categories: List[str]
+    """
+    All of the result's categories. See [arXiv: Category
+    Taxonomy](https://arxiv.org/category_taxonomy).
+    """
+    links: List[Link]
+    """Up to three URLs associated with this result."""
+    pdf_url: str
+    """The URL of a PDF version of this result if present among links."""
+    _raw: feedparser.FeedParserDict
+    """
+    The raw feedparser result object if this Result was constructed with
+    Result._from_feed_entry.
+    """
+
+    def __init__(
+        self,
+        entry_id: str,
+        updated: datetime = _DEFAULT_TIME,
+        published: datetime = _DEFAULT_TIME,
+        title: str = "",
+        authors: List[Author] = [],
+        summary: str = "",
+        comment: str = "",
+        journal_ref: str = "",
+        doi: str = "",
+        primary_category: str = "",
+        categories: List[str] = [],
+        links: List[Link] = [],
+        _raw: feedparser.FeedParserDict = None,
+    ):
+        """
+        Constructs an arXiv search result item.
+
+        In most cases, prefer using `Result._from_feed_entry` to parsing and
+        constructing `Result`s yourself.
+        """
+        self.entry_id = entry_id
+        self.updated = updated
+        self.published = published
+        self.title = title
+        self.authors = authors
+        self.summary = summary
+        self.comment = comment
+        self.journal_ref = journal_ref
+        self.doi = doi
+        self.primary_category = primary_category
+        self.categories = categories
+        self.links = links
+        # Calculated members
+        self.pdf_url = Result._get_pdf_url(links)
+        # Debugging
+        self._raw = _raw
+
+    def _from_feed_entry(entry: feedparser.FeedParserDict) -> Result:
+        """
+        Converts a feedparser entry for an arXiv search result feed into a
+        Result object.
+        """
+        if not hasattr(entry, "id"):
+            raise Result.MissingFieldError("id")
+        # Title attribute may be absent for certain titles. Defaulting to "0" as
+        # it's the only title observed to cause this bug.
+        # https://github.com/lukasschwab/arxiv.py/issues/71
+        # title = entry.title if hasattr(entry, "title") else "0"
+        title = "0"
+        if hasattr(entry, "title"):
+            title = entry.title
+        else:
+            logger.warning(
+                "Result %s is missing title attribute; defaulting to '0'", entry.id
+            )
+        return Result(
+            entry_id=entry.id,
+            updated=Result._to_datetime(entry.updated_parsed),
+            published=Result._to_datetime(entry.published_parsed),
+            title=re.sub(r"\s+", " ", title),
+            authors=[Result.Author._from_feed_author(a) for a in entry.authors],
+            summary=entry.summary,
+            comment=entry.get("arxiv_comment"),
+            journal_ref=entry.get("arxiv_journal_ref"),
+            doi=entry.get("arxiv_doi"),
+            primary_category=entry.arxiv_primary_category.get("term"),
+            categories=[tag.get("term") for tag in entry.tags],
+            links=[Result.Link._from_feed_link(link) for link in entry.links],
+            _raw=entry,
+        )
+
+    def __str__(self) -> str:
+        return self.entry_id
+
+    def __repr__(self) -> str:
+        return (
+            "{}(entry_id={}, updated={}, published={}, title={}, authors={}, "
+            "summary={}, comment={}, journal_ref={}, doi={}, "
+            "primary_category={}, categories={}, links={})"
+        ).format(
+            _classname(self),
+            repr(self.entry_id),
+            repr(self.updated),
+            repr(self.published),
+            repr(self.title),
+            repr(self.authors),
+            repr(self.summary),
+            repr(self.comment),
+            repr(self.journal_ref),
+            repr(self.doi),
+            repr(self.primary_category),
+            repr(self.categories),
+            repr(self.links),
+        )
+
+    def __eq__(self, other) -> bool:
+        if isinstance(other, Result):
+            return self.entry_id == other.entry_id
+        return False
+
+    def get_short_id(self) -> str:
+        """
+        Returns the short ID for this result.
+
+        + If the result URL is `"https://arxiv.org/abs/2107.05580v1"`,
+        `result.get_short_id()` returns `2107.05580v1`.
+
+        + If the result URL is `"https://arxiv.org/abs/quant-ph/0201082v1"`,
+        `result.get_short_id()` returns `"quant-ph/0201082v1"` (the pre-March
+        2007 arXiv identifier format).
+
+        For an explanation of the difference between arXiv's legacy and current
+        identifiers, see [Understanding the arXiv
+        identifier](https://arxiv.org/help/arxiv_identifier).
+        """
+        return self.entry_id.split("arxiv.org/abs/")[-1]
+
+    def _get_default_filename(self, extension: str = "pdf") -> str:
+        """
+        A default `to_filename` function for the extension given.
+        """
+        nonempty_title = self.title if self.title else "UNTITLED"
+        return ".".join(
+            [
+                self.get_short_id().replace("/", "_"),
+                re.sub(r"[^\w]", "_", nonempty_title),
+                extension,
+            ]
+        )
+
+    def download_pdf(self, dirpath: str = "./", filename: str = "") -> str:
+        """
+        Downloads the PDF for this result to the specified directory.
+
+        If `filename` is empty, `_get_default_filename()` generates one.
+        """
+        if not filename:
+            filename = self._get_default_filename()
+        path = os.path.join(dirpath, filename)
+        written_path, _ = urlretrieve(self.pdf_url, path)
+        return written_path
+
+    def download_source(self, dirpath: str = "./", filename: str = "") -> str:
+        """
+        Downloads the source tarfile for this result to the specified
+        directory.
+
+        If `filename` is empty, `_get_default_filename("tar.gz")` generates one.
+        """
+        if not filename:
+            filename = self._get_default_filename("tar.gz")
+        path = os.path.join(dirpath, filename)
+        # Bodge: construct the source URL from the PDF URL.
+        source_url = self.pdf_url.replace("/pdf/", "/src/")
+        written_path, _ = urlretrieve(source_url, path)
+        return written_path
+
+    def _get_pdf_url(links: List[Link]) -> str:
+        """
+        Finds the PDF link among a result's links and returns its URL.
+
+        Should only be called once for a given `Result`, in its constructor.
+        After construction, the URL should be available in `Result.pdf_url`.
+        """
+        pdf_urls = [link.href for link in links if link.title == "pdf"]
+        if len(pdf_urls) == 0:
+            return None
+        elif len(pdf_urls) > 1:
+            logger.warning("Result has multiple PDF links; using %s", pdf_urls[0])
+        return pdf_urls[0]
+
+    def _to_datetime(ts: time.struct_time) -> datetime:
+        """
+        Converts a UTC time.struct_time into a time-zone-aware datetime.
+
+        This will be replaced with feedparser functionality [when it becomes
+        available](https://github.com/kurtmckee/feedparser/issues/212).
+        """
+        return datetime.fromtimestamp(timegm(ts), tz=timezone.utc)
+
+    class Author(object):
+        """
+        A light inner class for representing a result's authors.
+        """
+
+        name: str
+        """The author's name."""
+
+        def __init__(self, name: str):
+            """
+            Constructs an `Author` with the specified name.
+
+            In most cases, prefer using `Author._from_feed_author` to parsing
+            and constructing `Author`s yourself.
+            """
+            self.name = name
+
+        def _from_feed_author(feed_author: feedparser.FeedParserDict) -> Result.Author:
+            """
+            Constructs an `Author` with the name specified in an author object
+            from a feed entry.
+
+            See usage in `Result._from_feed_entry`.
+            """
+            return Result.Author(feed_author.name)
+
+        def __str__(self) -> str:
+            return self.name
+
+        def __repr__(self) -> str:
+            return "{}({})".format(_classname(self), repr(self.name))
+
+        def __eq__(self, other) -> bool:
+            if isinstance(other, Result.Author):
+                return self.name == other.name
+            return False
+
+    class Link(object):
+        """
+        A light inner class for representing a result's links.
+        """
+
+        href: str
+        """The link's `href` attribute."""
+        title: str
+        """The link's title."""
+        rel: str
+        """The link's relationship to the `Result`."""
+        content_type: str
+        """The link's HTTP content type."""
+
+        def __init__(
+            self,
+            href: str,
+            title: str = None,
+            rel: str = None,
+            content_type: str = None,
+        ):
+            """
+            Constructs a `Link` with the specified link metadata.
+
+            In most cases, prefer using `Link._from_feed_link` to parsing and
+            constructing `Link`s yourself.
+            """
+            self.href = href
+            self.title = title
+            self.rel = rel
+            self.content_type = content_type
+
+        def _from_feed_link(feed_link: feedparser.FeedParserDict) -> Result.Link:
+            """
+            Constructs a `Link` with link metadata specified in a link object
+            from a feed entry.
+
+            See usage in `Result._from_feed_entry`.
+            """
+            return Result.Link(
+                href=feed_link.href,
+                title=feed_link.get("title"),
+                rel=feed_link.get("rel"),
+                content_type=feed_link.get("content_type"),
+            )
+
+        def __str__(self) -> str:
+            return self.href
+
+        def __repr__(self) -> str:
+            return "{}({}, title={}, rel={}, content_type={})".format(
+                _classname(self),
+                repr(self.href),
+                repr(self.title),
+                repr(self.rel),
+                repr(self.content_type),
+            )
+
+        def __eq__(self, other) -> bool:
+            if isinstance(other, Result.Link):
+                return self.href == other.href
+            return False
+
+    class MissingFieldError(Exception):
+        """
+        An error indicating an entry is unparseable because it lacks required
+        fields.
+        """
+
+        missing_field: str
+        """The required field missing from the would-be entry."""
+        message: str
+        """Message describing what caused this error."""
+
+        def __init__(self, missing_field):
+            self.missing_field = missing_field
+            self.message = "Entry from arXiv missing required info"
+
+        def __repr__(self) -> str:
+            return "{}({})".format(_classname(self), repr(self.missing_field))
+
+
+class SortCriterion(Enum):
+    """
+    A SortCriterion identifies a property by which search results can be
+    sorted.
+
+    See [the arXiv API User's Manual: sort order for return
+    results](https://arxiv.org/help/api/user-manual#sort).
+    """
+
+    Relevance = "relevance"
+    LastUpdatedDate = "lastUpdatedDate"
+    SubmittedDate = "submittedDate"
+
+
+class SortOrder(Enum):
+    """
+    A SortOrder indicates order in which search results are sorted according
+    to the specified arxiv.SortCriterion.
+
+    See [the arXiv API User's Manual: sort order for return
+    results](https://arxiv.org/help/api/user-manual#sort).
+    """
+
+    Ascending = "ascending"
+    Descending = "descending"
+
+
+class Search(object):
+    """
+    A specification for a search of arXiv's database.
+
+    To run a search, use `Search.results` to use a default client or
+    `Client.results` with a specific client.
+    """
+
+    query: str
+    """
+    A query string.
+
+    This should be unencoded. Use `au:del_maestro AND ti:checkerboard`, not
+    `au:del_maestro+AND+ti:checkerboard`.
+
+    See [the arXiv API User's Manual: Details of Query
+    Construction](https://arxiv.org/help/api/user-manual#query_details).
+    """
+    id_list: List[str]
+    """
+    A list of arXiv article IDs to which to limit the search.
+
+    See [the arXiv API User's
+    Manual](https://arxiv.org/help/api/user-manual#search_query_and_id_list)
+    for documentation of the interaction between `query` and `id_list`.
+    """
+    max_results: int | None
+    """
+    The maximum number of results to be returned in an execution of this
+    search.
+
+    To fetch every result available, set `max_results=None`.
+    """
+    sort_by: SortCriterion
+    """The sort criterion for results."""
+    sort_order: SortOrder
+    """The sort order for results."""
+
+    def __init__(
+        self,
+        query: str = "",
+        id_list: List[str] = [],
+        max_results: int | None = None,
+        sort_by: SortCriterion = SortCriterion.Relevance,
+        sort_order: SortOrder = SortOrder.Descending,
+    ):
+        """
+        Constructs an arXiv API search with the specified criteria.
+        """
+        self.query = query
+        self.id_list = id_list
+        # Handle deprecated v1 default behavior.
+        self.max_results = None if max_results == math.inf else max_results
+        self.sort_by = sort_by
+        self.sort_order = sort_order
+
+    def __str__(self) -> str:
+        # TODO: develop a more informative string representation.
+        return repr(self)
+
+    def __repr__(self) -> str:
+        return (
+            "{}(query={}, id_list={}, max_results={}, sort_by={}, " "sort_order={})"
+        ).format(
+            _classname(self),
+            repr(self.query),
+            repr(self.id_list),
+            repr(self.max_results),
+            repr(self.sort_by),
+            repr(self.sort_order),
+        )
+
+    def _url_args(self) -> Dict[str, str]:
+        """
+        Returns a dict of search parameters that should be included in an API
+        request for this search.
+        """
+        return {
+            "search_query": self.query,
+            "id_list": ",".join(self.id_list),
+            "sortBy": self.sort_by.value,
+            "sortOrder": self.sort_order.value,
+        }
+
+    def results(self, offset: int = 0) -> Generator[Result, None, None]:
+        """
+        Executes the specified search using a default arXiv API client.
+
+        For info on default behavior, see `Client.__init__` and `Client.results`.
+
+        **Deprecated** after 2.0.0; use `Client.results`.
+        """
+        warnings.warn(
+            "The '(Search).results' method is deprecated, use 'Client.results' instead",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return Client().results(self, offset=offset)
+
+
+class Client(object):
+    """
+    Specifies a strategy for fetching results from arXiv's API.
+
+    This class obscures pagination and retry logic, and exposes
+    `Client.results`.
+    """
+
+    query_url_format = "https://export.arxiv.org/api/query?{}"
+    """The arXiv query API endpoint format."""
+    page_size: int
+    """Maximum number of results fetched in a single API request."""
+    delay_seconds: float
+    """Number of seconds to wait between API requests."""
+    num_retries: int
+    """Number of times to retry a failing API request."""
+
+    _last_request_dt: datetime
+    _session: requests.Session
+
+    def __init__(
+        self, page_size: int = 100, delay_seconds: float = 3.0, num_retries: int = 3
+    ):
+        """
+        Constructs an arXiv API client with the specified options.
+
+        Note: the default parameters should provide a robust request strategy
+        for most use cases. Extreme page sizes, delays, or retries risk
+        violating the arXiv [API Terms of Use](https://arxiv.org/help/api/tou),
+        brittle behavior, and inconsistent results.
+        """
+        self.page_size = page_size
+        self.delay_seconds = delay_seconds
+        self.num_retries = num_retries
+        self._last_request_dt = None
+        self._session = requests.Session()
+
+    def __str__(self) -> str:
+        # TODO: develop a more informative string representation.
+        return repr(self)
+
+    def __repr__(self) -> str:
+        return "{}(page_size={}, delay_seconds={}, num_retries={})".format(
+            _classname(self),
+            repr(self.page_size),
+            repr(self.delay_seconds),
+            repr(self.num_retries),
+        )
+
+    def results(self, search: Search, offset: int = 0) -> Generator[Result, None, None]:
+        """
+        Uses this client configuration to fetch one page of the search results
+        at a time, yielding the parsed `Result`s, until `max_results` results
+        have been yielded or there are no more search results.
+
+        If all tries fail, raises an `UnexpectedEmptyPageError` or `HTTPError`.
+
+        Setting a nonzero `offset` discards leading records in the result set.
+        When `offset` is greater than or equal to `search.max_results`, the full
+        result set is discarded.
+
+        For more on using generators, see
+        [Generators](https://wiki.python.org/moin/Generators).
+        """
+        limit = search.max_results - offset if search.max_results else None
+        if limit and limit < 0:
+            return iter(())
+        return itertools.islice(self._results(search, offset), limit)
+
+    def _results(
+        self, search: Search, offset: int = 0
+    ) -> Generator[Result, None, None]:
+        page_url = self._format_url(search, offset, self.page_size)
+        feed = self._parse_feed(page_url, first_page=True)
+        if not feed.entries:
+            logger.info("Got empty first page; stopping generation")
+            return
+        total_results = int(feed.feed.opensearch_totalresults)
+        logger.info(
+            "Got first page: %d of %d total results",
+            len(feed.entries),
+            total_results,
+        )
+
+        while feed.entries:
+            for entry in feed.entries:
+                try:
+                    yield Result._from_feed_entry(entry)
+                except Result.MissingFieldError as e:
+                    logger.warning("Skipping partial result: %s", e)
+            offset += len(feed.entries)
+            if offset >= total_results:
+                break
+            page_url = self._format_url(search, offset, self.page_size)
+            feed = self._parse_feed(page_url, first_page=False)
+
+    def _format_url(self, search: Search, start: int, page_size: int) -> str:
+        """
+        Construct a request URL for `search` that returns up to `page_size`
+        results starting with the result at index `start`.
+        """
+        url_args = search._url_args()
+        url_args.update(
+            {
+                "start": start,
+                "max_results": page_size,
+            }
+        )
+        return self.query_url_format.format(urlencode(url_args))
+
+    def _parse_feed(
+        self, url: str, first_page: bool = True, _try_index: int = 0
+    ) -> feedparser.FeedParserDict:
+        """
+        Fetches the specified URL and parses it with feedparser.
+
+        If a request fails or is unexpectedly empty, retries the request up to
+        `self.num_retries` times.
+        """
+        try:
+            return self.__try_parse_feed(
+                url, first_page=first_page, try_index=_try_index
+            )
+        except (
+            HTTPError,
+            UnexpectedEmptyPageError,
+            requests.exceptions.ConnectionError,
+        ) as err:
+            if _try_index < self.num_retries:
+                logger.debug("Got error (try %d): %s", _try_index, err)
+                return self._parse_feed(
+                    url, first_page=first_page, _try_index=_try_index + 1
+                )
+            logger.debug("Giving up (try %d): %s", _try_index, err)
+            raise err
+
+    def __try_parse_feed(
+        self,
+        url: str,
+        first_page: bool,
+        try_index: int,
+    ) -> feedparser.FeedParserDict:
+        """
+        Single-attempt helper for the recursive `_parse_feed`. Enforces
+        `self.delay_seconds`: if that number of seconds has not passed since
+        the last request returned, sleeps until `delay_seconds` have passed.
+        """
+        # If this call would violate the rate limit, sleep until it doesn't.
+        if self._last_request_dt is not None:
+            required = timedelta(seconds=self.delay_seconds)
+            since_last_request = datetime.now() - self._last_request_dt
+            if since_last_request < required:
+                to_sleep = (required - since_last_request).total_seconds()
+                logger.info("Sleeping: %f seconds", to_sleep)
+                time.sleep(to_sleep)
+
+        logger.info(
+            "Requesting page (first: %r, try: %d): %s", first_page, try_index, url
+        )
+
+        resp = self._session.get(url, headers={"user-agent": "arxiv.py/2.0.0"})
+        self._last_request_dt = datetime.now()
+        if resp.status_code != requests.codes.OK:
+            raise HTTPError(url, try_index, resp.status_code)
+
+        feed = feedparser.parse(resp.content)
+        if len(feed.entries) == 0 and not first_page:
+            raise UnexpectedEmptyPageError(url, try_index, feed)
+
+        if feed.bozo:
+            logger.warning(
+                "Bozo feed; consider handling: %s",
+                feed.bozo_exception if "bozo_exception" in feed else None,
+            )
+
+        return feed
+
+
+class ArxivError(Exception):
+    """This package's base Exception class."""
+
+    url: str
+    """The feed URL that could not be fetched."""
+    retry: int
+    """
+    The request try number which encountered this error; 0 for the initial try,
+    1 for the first retry, and so on.
+    """
+    message: str
+    """Message describing what caused this error."""
+
+    def __init__(self, url: str, retry: int, message: str):
+        """
+        Constructs an `ArxivError` encountered while fetching the specified URL.
+        """
+        self.url = url
+        self.retry = retry
+        self.message = message
+        super().__init__(self.message)
+
+    def __str__(self) -> str:
+        return "{} ({})".format(self.message, self.url)
+
+
+class UnexpectedEmptyPageError(ArxivError):
+    """
+    An error raised when a page of results that should be non-empty is empty.
+
+    This should never happen in theory, but happens sporadically due to
+    brittleness in the underlying arXiv API; usually resolved by retries.
+
+    See `Client.results` for usage.
+    """
+
+    raw_feed: feedparser.FeedParserDict
+    """
+    The raw output of `feedparser.parse`. Sometimes this contains useful
+    diagnostic information, e.g. in 'bozo_exception'.
+    """
+
+    def __init__(self, url: str, retry: int, raw_feed: feedparser.FeedParserDict):
+        """
+        Constructs an `UnexpectedEmptyPageError` encountered for the specified
+        API URL after `retry` tries.
+        """
+        self.url = url
+        self.raw_feed = raw_feed
+        super().__init__(url, retry, "Page of results was unexpectedly empty")
+
+    def __repr__(self) -> str:
+        return "{}({}, {}, {})".format(
+            _classname(self), repr(self.url), repr(self.retry), repr(self.raw_feed)
+        )
+
+
+class HTTPError(ArxivError):
+    """
+    A non-200 status encountered while fetching a page of results.
+
+    See `Client.results` for usage.
+    """
+
+    status: int
+    """The HTTP status code of the failed response."""
+
+    def __init__(self, url: str, retry: int, status: int):
+        """
+        Constructs an `HTTPError` for the specified status code, encountered for
+        the specified API URL after `retry` tries.
+        """
+        self.url = url
+        self.status = status
+        super().__init__(
+            url,
+            retry,
+            "Page request resulted in HTTP {}".format(self.status),
+        )
+
+    def __repr__(self) -> str:
+        return "{}({}, {}, {})".format(
+            _classname(self), repr(self.url), repr(self.retry), repr(self.status)
+        )
+
+
+def _classname(o):
+    """A helper function for use in __repr__ methods: arxiv.Result.Link."""
+    return "arxiv.{}".format(o.__class__.__qualname__)
diff --git a/arxiv/arxiv.py b/arxiv/arxiv.py
index 9985c69..36a1862 100644
--- a/arxiv/arxiv.py
+++ b/arxiv/arxiv.py
@@ -1,767 +1,10 @@
-""".. include:: ../README.md"""
-from __future__ import annotations
+"""
+This submodule is only an alias included for backwards compatibility. Its use is
+deprecated as of 2.0.0.
 
-import logging
-import time
-import itertools
-import feedparser
-import os
-import math
-import re
-import requests
+Use `import arxiv`.
+"""
+from .__init__ import *  # noqa: F403
 import warnings
 
-from urllib.parse import urlencode
-from urllib.request import urlretrieve
-from datetime import datetime, timedelta, timezone
-from calendar import timegm
-
-from enum import Enum
-from typing import Dict, Generator, List
-
-logger = logging.getLogger(__name__)
-
-_DEFAULT_TIME = datetime.min
-
-
-class Result(object):
-    """
-    An entry in an arXiv query results feed.
-
-    See [the arXiv API User's Manual: Details of Atom Results
-    Returned](https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned).
-    """
-
-    entry_id: str
-    """A url of the form `https://arxiv.org/abs/{id}`."""
-    updated: datetime
-    """When the result was last updated."""
-    published: datetime
-    """When the result was originally published."""
-    title: str
-    """The title of the result."""
-    authors: List[Author]
-    """The result's authors."""
-    summary: str
-    """The result abstract."""
-    comment: str
-    """The authors' comment if present."""
-    journal_ref: str
-    """A journal reference if present."""
-    doi: str
-    """A URL for the resolved DOI to an external resource if present."""
-    primary_category: str
-    """
-    The result's primary arXiv category. See [arXiv: Category
-    Taxonomy](https://arxiv.org/category_taxonomy).
-    """
-    categories: List[str]
-    """
-    All of the result's categories. See [arXiv: Category
-    Taxonomy](https://arxiv.org/category_taxonomy).
-    """
-    links: List[Link]
-    """Up to three URLs associated with this result."""
-    pdf_url: str
-    """The URL of a PDF version of this result if present among links."""
-    _raw: feedparser.FeedParserDict
-    """
-    The raw feedparser result object if this Result was constructed with
-    Result._from_feed_entry.
-    """
-
-    def __init__(
-        self,
-        entry_id: str,
-        updated: datetime = _DEFAULT_TIME,
-        published: datetime = _DEFAULT_TIME,
-        title: str = "",
-        authors: List[Author] = [],
-        summary: str = "",
-        comment: str = "",
-        journal_ref: str = "",
-        doi: str = "",
-        primary_category: str = "",
-        categories: List[str] = [],
-        links: List[Link] = [],
-        _raw: feedparser.FeedParserDict = None,
-    ):
-        """
-        Constructs an arXiv search result item.
-
-        In most cases, prefer using `Result._from_feed_entry` to parsing and
-        constructing `Result`s yourself.
-        """
-        self.entry_id = entry_id
-        self.updated = updated
-        self.published = published
-        self.title = title
-        self.authors = authors
-        self.summary = summary
-        self.comment = comment
-        self.journal_ref = journal_ref
-        self.doi = doi
-        self.primary_category = primary_category
-        self.categories = categories
-        self.links = links
-        # Calculated members
-        self.pdf_url = Result._get_pdf_url(links)
-        # Debugging
-        self._raw = _raw
-
-    def _from_feed_entry(entry: feedparser.FeedParserDict) -> Result:
-        """
-        Converts a feedparser entry for an arXiv search result feed into a
-        Result object.
-        """
-        if not hasattr(entry, "id"):
-            raise Result.MissingFieldError("id")
-        # Title attribute may be absent for certain titles. Defaulting to "0" as
-        # it's the only title observed to cause this bug.
-        # https://github.com/lukasschwab/arxiv.py/issues/71
-        # title = entry.title if hasattr(entry, "title") else "0"
-        title = "0"
-        if hasattr(entry, "title"):
-            title = entry.title
-        else:
-            logger.warning(
-                "Result %s is missing title attribute; defaulting to '0'", entry.id
-            )
-        return Result(
-            entry_id=entry.id,
-            updated=Result._to_datetime(entry.updated_parsed),
-            published=Result._to_datetime(entry.published_parsed),
-            title=re.sub(r"\s+", " ", title),
-            authors=[Result.Author._from_feed_author(a) for a in entry.authors],
-            summary=entry.summary,
-            comment=entry.get("arxiv_comment"),
-            journal_ref=entry.get("arxiv_journal_ref"),
-            doi=entry.get("arxiv_doi"),
-            primary_category=entry.arxiv_primary_category.get("term"),
-            categories=[tag.get("term") for tag in entry.tags],
-            links=[Result.Link._from_feed_link(link) for link in entry.links],
-            _raw=entry,
-        )
-
-    def __str__(self) -> str:
-        return self.entry_id
-
-    def __repr__(self) -> str:
-        return (
-            "{}(entry_id={}, updated={}, published={}, title={}, authors={}, "
-            "summary={}, comment={}, journal_ref={}, doi={}, "
-            "primary_category={}, categories={}, links={})"
-        ).format(
-            _classname(self),
-            repr(self.entry_id),
-            repr(self.updated),
-            repr(self.published),
-            repr(self.title),
-            repr(self.authors),
-            repr(self.summary),
-            repr(self.comment),
-            repr(self.journal_ref),
-            repr(self.doi),
-            repr(self.primary_category),
-            repr(self.categories),
-            repr(self.links),
-        )
-
-    def __eq__(self, other) -> bool:
-        if isinstance(other, Result):
-            return self.entry_id == other.entry_id
-        return False
-
-    def get_short_id(self) -> str:
-        """
-        Returns the short ID for this result.
-
-        + If the result URL is `"https://arxiv.org/abs/2107.05580v1"`,
-        `result.get_short_id()` returns `2107.05580v1`.
-
-        + If the result URL is `"https://arxiv.org/abs/quant-ph/0201082v1"`,
-        `result.get_short_id()` returns `"quant-ph/0201082v1"` (the pre-March
-        2007 arXiv identifier format).
-
-        For an explanation of the difference between arXiv's legacy and current
-        identifiers, see [Understanding the arXiv
-        identifier](https://arxiv.org/help/arxiv_identifier).
-        """
-        return self.entry_id.split("arxiv.org/abs/")[-1]
-
-    def _get_default_filename(self, extension: str = "pdf") -> str:
-        """
-        A default `to_filename` function for the extension given.
-        """
-        nonempty_title = self.title if self.title else "UNTITLED"
-        return ".".join(
-            [
-                self.get_short_id().replace("/", "_"),
-                re.sub(r"[^\w]", "_", nonempty_title),
-                extension,
-            ]
-        )
-
-    def download_pdf(self, dirpath: str = "./", filename: str = "") -> str:
-        """
-        Downloads the PDF for this result to the specified directory.
-
-        The filename is generated by calling `to_filename(self)`.
-        """
-        if not filename:
-            filename = self._get_default_filename()
-        path = os.path.join(dirpath, filename)
-        written_path, _ = urlretrieve(self.pdf_url, path)
-        return written_path
-
-    def download_source(self, dirpath: str = "./", filename: str = "") -> str:
-        """
-        Downloads the source tarfile for this result to the specified
-        directory.
-
-        The filename is generated by calling `to_filename(self)`.
-        """
-        if not filename:
-            filename = self._get_default_filename("tar.gz")
-        path = os.path.join(dirpath, filename)
-        # Bodge: construct the source URL from the PDF URL.
-        source_url = self.pdf_url.replace("/pdf/", "/src/")
-        written_path, _ = urlretrieve(source_url, path)
-        return written_path
-
-    def _get_pdf_url(links: List[Link]) -> str:
-        """
-        Finds the PDF link among a result's links and returns its URL.
-
-        Should only be called once for a given `Result`, in its constructor.
-        After construction, the URL should be available in `Result.pdf_url`.
-        """
-        pdf_urls = [link.href for link in links if link.title == "pdf"]
-        if len(pdf_urls) == 0:
-            return None
-        elif len(pdf_urls) > 1:
-            logger.warning("Result has multiple PDF links; using %s", pdf_urls[0])
-        return pdf_urls[0]
-
-    def _to_datetime(ts: time.struct_time) -> datetime:
-        """
-        Converts a UTC time.struct_time into a time-zone-aware datetime.
-
-        This will be replaced with feedparser functionality [when it becomes
-        available](https://github.com/kurtmckee/feedparser/issues/212).
-        """
-        return datetime.fromtimestamp(timegm(ts), tz=timezone.utc)
-
-    class Author(object):
-        """
-        A light inner class for representing a result's authors.
-        """
-
-        name: str
-        """The author's name."""
-
-        def __init__(self, name: str):
-            """
-            Constructs an `Author` with the specified name.
-
-            In most cases, prefer using `Author._from_feed_author` to parsing
-            and constructing `Author`s yourself.
-            """
-            self.name = name
-
-        def _from_feed_author(feed_author: feedparser.FeedParserDict) -> Result.Author:
-            """
-            Constructs an `Author` with the name specified in an author object
-            from a feed entry.
-
-            See usage in `Result._from_feed_entry`.
-            """
-            return Result.Author(feed_author.name)
-
-        def __str__(self) -> str:
-            return self.name
-
-        def __repr__(self) -> str:
-            return "{}({})".format(_classname(self), repr(self.name))
-
-        def __eq__(self, other) -> bool:
-            if isinstance(other, Result.Author):
-                return self.name == other.name
-            return False
-
-    class Link(object):
-        """
-        A light inner class for representing a result's links.
-        """
-
-        href: str
-        """The link's `href` attribute."""
-        title: str
-        """The link's title."""
-        rel: str
-        """The link's relationship to the `Result`."""
-        content_type: str
-        """The link's HTTP content type."""
-
-        def __init__(
-            self,
-            href: str,
-            title: str = None,
-            rel: str = None,
-            content_type: str = None,
-        ):
-            """
-            Constructs a `Link` with the specified link metadata.
-
-            In most cases, prefer using `Link._from_feed_link` to parsing and
-            constructing `Link`s yourself.
-            """
-            self.href = href
-            self.title = title
-            self.rel = rel
-            self.content_type = content_type
-
-        def _from_feed_link(feed_link: feedparser.FeedParserDict) -> Result.Link:
-            """
-            Constructs a `Link` with link metadata specified in a link object
-            from a feed entry.
-
-            See usage in `Result._from_feed_entry`.
-            """
-            return Result.Link(
-                href=feed_link.href,
-                title=feed_link.get("title"),
-                rel=feed_link.get("rel"),
-                content_type=feed_link.get("content_type"),
-            )
-
-        def __str__(self) -> str:
-            return self.href
-
-        def __repr__(self) -> str:
-            return "{}({}, title={}, rel={}, content_type={})".format(
-                _classname(self),
-                repr(self.href),
-                repr(self.title),
-                repr(self.rel),
-                repr(self.content_type),
-            )
-
-        def __eq__(self, other) -> bool:
-            if isinstance(other, Result.Link):
-                return self.href == other.href
-            return False
-
-    class MissingFieldError(Exception):
-        """
-        An error indicating an entry is unparseable because it lacks required
-        fields.
-        """
-
-        missing_field: str
-        """The required field missing from the would-be entry."""
-        message: str
-        """Message describing what caused this error."""
-
-        def __init__(self, missing_field):
-            self.missing_field = missing_field
-            self.message = "Entry from arXiv missing required info"
-
-        def __repr__(self) -> str:
-            return "{}({})".format(_classname(self), repr(self.missing_field))
-
-
-class SortCriterion(Enum):
-    """
-    A SortCriterion identifies a property by which search results can be
-    sorted.
-
-    See [the arXiv API User's Manual: sort order for return
-    results](https://arxiv.org/help/api/user-manual#sort).
-    """
-
-    Relevance = "relevance"
-    LastUpdatedDate = "lastUpdatedDate"
-    SubmittedDate = "submittedDate"
-
-
-class SortOrder(Enum):
-    """
-    A SortOrder indicates order in which search results are sorted according
-    to the specified arxiv.SortCriterion.
-
-    See [the arXiv API User's Manual: sort order for return
-    results](https://arxiv.org/help/api/user-manual#sort).
-    """
-
-    Ascending = "ascending"
-    Descending = "descending"
-
-
-class Search(object):
-    """
-    A specification for a search of arXiv's database.
-
-    To run a search, use `Search.run` to use a default client or `Client.run`
-    with a specific client.
-    """
-
-    query: str
-    """
-    A query string.
-
-    This should be unencoded. Use `au:del_maestro AND ti:checkerboard`, not
-    `au:del_maestro+AND+ti:checkerboard`.
-
-    See [the arXiv API User's Manual: Details of Query
-    Construction](https://arxiv.org/help/api/user-manual#query_details).
-    """
-    id_list: List[str]
-    """
-    A list of arXiv article IDs to which to limit the search.
-
-    See [the arXiv API User's
-    Manual](https://arxiv.org/help/api/user-manual#search_query_and_id_list)
-    for documentation of the interaction between `query` and `id_list`.
-    """
-    max_results: int | None
-    """
-    The maximum number of results to be returned in an execution of this
-    search.
-
-    To fetch every result available, set `max_results=None`.
-    """
-    sort_by: SortCriterion
-    """The sort criterion for results."""
-    sort_order: SortOrder
-    """The sort order for results."""
-
-    def __init__(
-        self,
-        query: str = "",
-        id_list: List[str] = [],
-        max_results: int | None = None,
-        sort_by: SortCriterion = SortCriterion.Relevance,
-        sort_order: SortOrder = SortOrder.Descending,
-    ):
-        """
-        Constructs an arXiv API search with the specified criteria.
-        """
-        self.query = query
-        self.id_list = id_list
-        # Handle deprecated v1 default behavior.
-        self.max_results = None if max_results == math.inf else max_results
-        self.sort_by = sort_by
-        self.sort_order = sort_order
-
-    def __str__(self) -> str:
-        # TODO: develop a more informative string representation.
-        return repr(self)
-
-    def __repr__(self) -> str:
-        return (
-            "{}(query={}, id_list={}, max_results={}, sort_by={}, " "sort_order={})"
-        ).format(
-            _classname(self),
-            repr(self.query),
-            repr(self.id_list),
-            repr(self.max_results),
-            repr(self.sort_by),
-            repr(self.sort_order),
-        )
-
-    def _url_args(self) -> Dict[str, str]:
-        """
-        Returns a dict of search parameters that should be included in an API
-        request for this search.
-        """
-        return {
-            "search_query": self.query,
-            "id_list": ",".join(self.id_list),
-            "sortBy": self.sort_by.value,
-            "sortOrder": self.sort_order.value,
-        }
-
-    def results(self, offset: int = 0) -> Generator[Result, None, None]:
-        """
-        Executes the specified search using a default arXiv API client.
-
-        For info on default behavior, see `Client.__init__` and `Client.results`.
-
-        **Deprecated** after 2.0.0; use `Client.results`.
-        """
-        warnings.warn(
-            "The '(Search).results' method is deprecated, use 'Client.results' instead",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        return Client().results(self, offset=offset)
-
-
-class Client(object):
-    """
-    Specifies a strategy for fetching results from arXiv's API.
-
-    This class obscures pagination and retry logic, and exposes
-    `Client.results`.
-    """
-
-    query_url_format = "https://export.arxiv.org/api/query?{}"
-    """The arXiv query API endpoint format."""
-    page_size: int
-    """Maximum number of results fetched in a single API request."""
-    delay_seconds: float
-    """Number of seconds to wait between API requests."""
-    num_retries: int
-    """Number of times to retry a failing API request."""
-
-    _last_request_dt: datetime
-    _session: requests.Session
-
-    def __init__(
-        self, page_size: int = 100, delay_seconds: float = 3.0, num_retries: int = 3
-    ):
-        """
-        Constructs an arXiv API client with the specified options.
-
-        Note: the default parameters should provide a robust request strategy
-        for most use cases. Extreme page sizes, delays, or retries risk
-        violating the arXiv [API Terms of Use](https://arxiv.org/help/api/tou),
-        brittle behavior, and inconsistent results.
-        """
-        self.page_size = page_size
-        self.delay_seconds = delay_seconds
-        self.num_retries = num_retries
-        self._last_request_dt = None
-        self._session = requests.Session()
-
-    def __str__(self) -> str:
-        # TODO: develop a more informative string representation.
-        return repr(self)
-
-    def __repr__(self) -> str:
-        return "{}(page_size={}, delay_seconds={}, num_retries={})".format(
-            _classname(self),
-            repr(self.page_size),
-            repr(self.delay_seconds),
-            repr(self.num_retries),
-        )
-
-    def results(self, search: Search, offset: int = 0) -> Generator[Result, None, None]:
-        """
-        Uses this client configuration to fetch one page of the search results
-        at a time, yielding the parsed `Result`s, until `max_results` results
-        have been yielded or there are no more search results.
-
-        If all tries fail, raises an `UnexpectedEmptyPageError` or `HTTPError`.
-
-        Setting a nonzero `offset` discards leading records in the result set.
-        When `offset` is greater than or equal to `search.max_results`, the full
-        result set is discarded.
-
-        For more on using generators, see
-        [Generators](https://wiki.python.org/moin/Generators).
-        """
-        limit = search.max_results - offset if search.max_results else None
-        if limit and limit < 0:
-            return iter(())
-        return itertools.islice(self._results(search, offset), limit)
-
-    def _results(
-        self, search: Search, offset: int = 0
-    ) -> Generator[Result, None, None]:
-        page_url = self._format_url(search, offset, self.page_size)
-        feed = self._parse_feed(page_url, first_page=True)
-        if not feed.entries:
-            logger.info("Got empty first page; stopping generation")
-            return
-        total_results = int(feed.feed.opensearch_totalresults)
-        logger.info(
-            "Got first page: %d of %d total results",
-            len(feed.entries),
-            total_results,
-        )
-
-        while feed.entries:
-            for entry in feed.entries:
-                try:
-                    yield Result._from_feed_entry(entry)
-                except Result.MissingFieldError as e:
-                    logger.warning("Skipping partial result: %s", e)
-            offset += len(feed.entries)
-            if offset >= total_results:
-                break
-            page_url = self._format_url(search, offset, self.page_size)
-            feed = self._parse_feed(page_url, first_page=False)
-
-    def _format_url(self, search: Search, start: int, page_size: int) -> str:
-        """
-        Construct a request API for search that returns up to `page_size`
-        results starting with the result at index `start`.
-        """
-        url_args = search._url_args()
-        url_args.update(
-            {
-                "start": start,
-                "max_results": page_size,
-            }
-        )
-        return self.query_url_format.format(urlencode(url_args))
-
-    def _parse_feed(
-        self, url: str, first_page: bool = True, _try_index: int = 0
-    ) -> feedparser.FeedParserDict:
-        """
-        Fetches the specified URL and parses it with feedparser.
-
-        If a request fails or is unexpectedly empty, retries the request up to
-        `self.num_retries` times.
-        """
-        try:
-            return self.__try_parse_feed(
-                url, first_page=first_page, try_index=_try_index
-            )
-        except (
-            HTTPError,
-            UnexpectedEmptyPageError,
-            requests.exceptions.ConnectionError,
-        ) as err:
-            if _try_index < self.num_retries:
-                logger.debug("Got error (try %d): %s", _try_index, err)
-                return self._parse_feed(
-                    url, first_page=first_page, _try_index=_try_index + 1
-                )
-            logger.debug("Giving up (try %d): %s", _try_index, err)
-            raise err
-
-    def __try_parse_feed(
-        self,
-        url: str,
-        first_page: bool,
-        try_index: int,
-    ) -> feedparser.FeedParserDict:
-        """
-        Recursive helper for _parse_feed. Enforces `self.delay_seconds`: if that
-        number of seconds has not passed since `_parse_feed` was last called,
-        sleeps until delay_seconds seconds have passed.
-        """
-        # If this call would violate the rate limit, sleep until it doesn't.
-        if self._last_request_dt is not None:
-            required = timedelta(seconds=self.delay_seconds)
-            since_last_request = datetime.now() - self._last_request_dt
-            if since_last_request < required:
-                to_sleep = (required - since_last_request).total_seconds()
-                logger.info("Sleeping: %f seconds", to_sleep)
-                time.sleep(to_sleep)
-
-        logger.info(
-            "Requesting page (first: %r, try: %d): %s", first_page, try_index, url
-        )
-
-        resp = self._session.get(url, headers={"user-agent": "arxiv.py/2.0.0"})
-        self._last_request_dt = datetime.now()
-        if resp.status_code != requests.codes.OK:
-            raise HTTPError(url, try_index, resp.status_code)
-
-        feed = feedparser.parse(resp.content)
-        if len(feed.entries) == 0 and not first_page:
-            raise UnexpectedEmptyPageError(url, try_index, feed)
-
-        if feed.bozo:
-            logger.warning(
-                "Bozo feed; consider handling: %s",
-                feed.bozo_exception if "bozo_exception" in feed else None,
-            )
-
-        return feed
-
-
-class ArxivError(Exception):
-    """This package's base Exception class."""
-
-    url: str
-    """The feed URL that could not be fetched."""
-    retry: int
-    """
-    The request try number which encountered this error; 0 for the initial try,
-    1 for the first retry, and so on.
-    """
-    message: str
-    """Message describing what caused this error."""
-
-    def __init__(self, url: str, retry: int, message: str):
-        """
-        Constructs an `ArxivError` encountered while fetching the specified URL.
-        """
-        self.url = url
-        self.retry = retry
-        self.message = message
-        super().__init__(self.message)
-
-    def __str__(self) -> str:
-        return "{} ({})".format(self.message, self.url)
-
-
-class UnexpectedEmptyPageError(ArxivError):
-    """
-    An error raised when a page of results that should be non-empty is empty.
-
-    This should never happen in theory, but happens sporadically due to
-    brittleness in the underlying arXiv API; usually resolved by retries.
-
-    See `Client.results` for usage.
-    """
-
-    raw_feed: feedparser.FeedParserDict
-    """
-    The raw output of `feedparser.parse`. Sometimes this contains useful
-    diagnostic information, e.g. in 'bozo_exception'.
-    """
-
-    def __init__(self, url: str, retry: int, raw_feed: feedparser.FeedParserDict):
-        """
-        Constructs an `UnexpectedEmptyPageError` encountered for the specified
-        API URL after `retry` tries.
-        """
-        self.url = url
-        self.raw_feed = raw_feed
-        super().__init__(url, retry, "Page of results was unexpectedly empty")
-
-    def __repr__(self) -> str:
-        return "{}({}, {}, {})".format(
-            _classname(self), repr(self.url), repr(self.retry), repr(self.raw_feed)
-        )
-
-
-class HTTPError(ArxivError):
-    """
-    A non-200 status encountered while fetching a page of results.
-
-    See `Client.results` for usage.
-    """
-
-    status: int
-    """The HTTP status reported by feedparser."""
-
-    def __init__(self, url: str, retry: int, status: int):
-        """
-        Constructs an `HTTPError` for the specified status code, encountered for
-        the specified API URL after `retry` tries.
-        """
-        self.url = url
-        self.status = status
-        super().__init__(
-            url,
-            retry,
-            "Page request resulted in HTTP {}".format(self.status),
-        )
-
-    def __repr__(self) -> str:
-        return "{}({}, {}, {})".format(
-            _classname(self), repr(self.url), repr(self.retry), repr(self.status)
-        )
-
-
-def _classname(o):
-    """A helper function for use in __repr__ methods: arxiv.Result.Link."""
-    return "arxiv.{}".format(o.__class__.__qualname__)
+warnings.warn("**Deprecated** after 2.0.0; use `import arxiv` instead.")  # NOTE(review): no category is passed, so this is emitted as a UserWarning
diff --git a/tests/test_package.py b/tests/test_package.py
new file mode 100644
index 0000000..f610838
--- /dev/null
+++ b/tests/test_package.py
@@ -0,0 +1,30 @@
+"""
+Tests that the deprecated `arxiv.arxiv` submodule still exports the package's classes.
+"""
+import unittest
+from typing import Set
+
+
+# ruff: noqa: F401
+class TestPackage(unittest.TestCase):
+    def get_public_classes(module: object) -> Set[str]:
+        """
+        Bodge: filter for the portion of the namespace that looks like exports.
+        """
+        return {name for name in dir(module) if name[0].isupper()}
+
+    def test_deprecated_import_pattern(self):
+        import arxiv as nondeprecated
+
+        expected = TestPackage.get_public_classes(nondeprecated)
+        self.assertTrue(
+            expected, "should export non-empty set of classes; check the helper"
+        )
+
+        from arxiv import arxiv as deprecated_from
+
+        self.assertSetEqual(expected, TestPackage.get_public_classes(deprecated_from))
+
+        import arxiv.arxiv as deprecated_dot
+
+        self.assertSetEqual(expected, TestPackage.get_public_classes(deprecated_dot))